IMPORTAMOS LAS LIBRERÍAS

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
import warnings
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

FUNCIONES

In [2]:
def plot_feature(df, col_name, isContinuous, target):
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
    
    count_null = df[col_name].isnull().sum()
    if isContinuous:
        
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(df[col_name], order=sorted(df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name+ ' Numero de nulos: '+str(count_null))
    plt.xticks(rotation = 90)


    if isContinuous:
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by '+target)
    else:
        data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index() 
        data.columns = [i, target, 'proportion']
        #sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        sns.barplot(x = col_name, y = 'proportion', hue= target, data = data, saturation=1, ax=ax2)
        ax2.set_ylabel(target+' fraction')
        ax2.set_title(target)
        plt.xticks(rotation = 90)
    ax2.set_xlabel(col_name)
    
    plt.tight_layout()

def dame_variables_categoricas(dataset=None):
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1
    lista_variables_categoricas = []
    other = []

    for i in dataset.columns:

        if dataset[i].dtype == object:
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 100:
                lista_variables_categoricas.append(i)
            else:
                other.append(i)
        
        if dataset[i].dtype == int:
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 10:
                lista_variables_categoricas.append(i)
            else:
                other.append(i)

    return lista_variables_categoricas, other

def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    # Para obtener la correlación de Spearman, sólo cambiar el metodo por 'spearman'

    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1
    sns.set(style="white")
    # Compute the correlation matrix
    corr = dataset.corr(method=metodo) 
    # Set self-correlation to zero to avoid distraction
    for i in range(corr.shape[0]):
        corr.iloc[i, i] = 0
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=size_figure)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, center=0,
                square=True, linewidths=.5,  cmap ='viridis' ) #cbar_kws={"shrink": .5}
    plt.show()
    
    return 0

def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    pd_final = pd.DataFrame()
    
    for i in list_var_continuous:
        
        series_mean = pd_loan[i].mean()
        series_std = pd_loan[i].std()
        std_amp = multiplier * series_std
        left = series_mean - std_amp
        right = series_mean + std_amp
        size_s = pd_loan[i].size
        
        perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
        perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
        
        if perc_excess>0:    
            pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
                                            .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0,0], 
                                         pd_concat_percent.iloc[0,1]]
            pd_concat_percent = pd_concat_percent.drop('TARGET',axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
            pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
            
    if pd_final.empty:
        print('No existen variables con valores nulos')
        
    return pd_final

def get_percent_null_values_target(pd_loan, list_var_continuous, target):

    pd_final = pd.DataFrame()
    for i in list_var_continuous:
        if pd_loan[i].isnull().sum()>0:
            pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]\
                                            .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0,0], 
                                         pd_concat_percent.iloc[0,1]]
            pd_concat_percent = pd_concat_percent.drop('TARGET',axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
            pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum()/pd_loan.shape[0]
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
            
    if pd_final.empty:
        print('No existen variables con valores nulos')
        
    return pd_final

def cramers_v(confusion_matrix):
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

Con estas funciones estamos tratando de hacer varias cosas:

  1. plot_feature: va a generar gráficos específicos para cada variable donde hará un histograma para analizar la distribución y boxplot para ver cómo se distribuyen los valores en base a la variable TARGET para las variables contínuas. Para las variables categóricas va a crear dos gráficos de barras, para la frecuencia de la variable y otro con las proporciones por categoría de TARGET

  2. dame_variables_categoricas: trata de identificar variables categóricas y separarlas en dos: aquellas variables que tienen menos de 100 valores únicos (estas pueden ser muy útiles en nuestro análisis), y aquellas que tienen más de 100 registros únicos

  3. get_corr_matrix: para visualizar la matriz de correlación para variables numéricas

  4. get_deviation_of_mean_perc: crea un rango de valores alrededor de la media con la desviación típica y encuentra la proporción de valores dentro y fuera de este intervalo, para así poder ver los valores atípicos y la relación que tienen con la variable TARGET.

  5. get_percent_values_target: es para analizar la relación que hay entre valores nulos y la variable TARGET.

  6. cramers_v: para ver la relación entre variables categóricas usando la estadística de Cramer

LECTURA DE DATOS DEL PREPROCESADO INICIAL

In [3]:
path_folder = "../data/"
pd_loan = pd.read_csv(path_folder + "application_data.csv", low_memory = False).set_index('SK_ID_CURR')

pd_loan.head()
Out[3]:
TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL AMT_CREDIT AMT_ANNUITY AMT_GOODS_PRICE NAME_TYPE_SUITE NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE REGION_POPULATION_RELATIVE DAYS_BIRTH DAYS_EMPLOYED DAYS_REGISTRATION DAYS_ID_PUBLISH OWN_CAR_AGE FLAG_MOBIL FLAG_EMP_PHONE FLAG_WORK_PHONE FLAG_CONT_MOBILE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS REGION_RATING_CLIENT REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START HOUR_APPR_PROCESS_START REG_REGION_NOT_LIVE_REGION REG_REGION_NOT_WORK_REGION LIVE_REGION_NOT_WORK_REGION REG_CITY_NOT_LIVE_CITY REG_CITY_NOT_WORK_CITY LIVE_CITY_NOT_WORK_CITY ORGANIZATION_TYPE EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3 APARTMENTS_AVG BASEMENTAREA_AVG YEARS_BEGINEXPLUATATION_AVG YEARS_BUILD_AVG COMMONAREA_AVG ELEVATORS_AVG ENTRANCES_AVG FLOORSMAX_AVG FLOORSMIN_AVG LANDAREA_AVG LIVINGAPARTMENTS_AVG LIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG NONLIVINGAREA_AVG APARTMENTS_MODE BASEMENTAREA_MODE YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_MODE COMMONAREA_MODE ELEVATORS_MODE ENTRANCES_MODE FLOORSMAX_MODE FLOORSMIN_MODE LANDAREA_MODE LIVINGAPARTMENTS_MODE LIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_MODE APARTMENTS_MEDI BASEMENTAREA_MEDI YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MEDI COMMONAREA_MEDI ELEVATORS_MEDI ENTRANCES_MEDI FLOORSMAX_MEDI FLOORSMIN_MEDI LANDAREA_MEDI LIVINGAPARTMENTS_MEDI LIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MEDI FONDKAPREMONT_MODE HOUSETYPE_MODE TOTALAREA_MODE WALLSMATERIAL_MODE EMERGENCYSTATE_MODE OBS_30_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE OBS_60_CNT_SOCIAL_CIRCLE DEF_60_CNT_SOCIAL_CIRCLE DAYS_LAST_PHONE_CHANGE FLAG_DOCUMENT_2 FLAG_DOCUMENT_3 FLAG_DOCUMENT_4 FLAG_DOCUMENT_5 FLAG_DOCUMENT_6 FLAG_DOCUMENT_7 FLAG_DOCUMENT_8 FLAG_DOCUMENT_9 FLAG_DOCUMENT_10 FLAG_DOCUMENT_11 FLAG_DOCUMENT_12 FLAG_DOCUMENT_13 FLAG_DOCUMENT_14 FLAG_DOCUMENT_15 FLAG_DOCUMENT_16 FLAG_DOCUMENT_17 FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUMENT_21 AMT_REQ_CREDIT_BUREAU_HOUR AMT_REQ_CREDIT_BUREAU_DAY AMT_REQ_CREDIT_BUREAU_WEEK AMT_REQ_CREDIT_BUREAU_MON AMT_REQ_CREDIT_BUREAU_QRT AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR
100002 1 Cash loans M N Y 0 202500.0 406597.5 24700.5 351000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.018801 -9461 -637 -3648.0 -2120 NaN 1 1 0 1 1 0 Laborers 1.0 2 2 WEDNESDAY 10 0 0 0 0 0 0 Business Entity Type 3 0.083037 0.262949 0.139376 0.0247 0.0369 0.9722 0.6192 0.0143 0.00 0.0690 0.0833 0.1250 0.0369 0.0202 0.0190 0.0000 0.0000 0.0252 0.0383 0.9722 0.6341 0.0144 0.0000 0.0690 0.0833 0.1250 0.0377 0.022 0.0198 0.0 0.0 0.0250 0.0369 0.9722 0.6243 0.0144 0.00 0.0690 0.0833 0.1250 0.0375 0.0205 0.0193 0.0000 0.00 reg oper account block of flats 0.0149 Stone, brick No 2.0 2.0 2.0 2.0 -1134.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 1.0
100003 0 Cash loans F N N 0 270000.0 1293502.5 35698.5 1129500.0 Family State servant Higher education Married House / apartment 0.003541 -16765 -1188 -1186.0 -291 NaN 1 1 0 1 1 0 Core staff 2.0 1 1 MONDAY 11 0 0 0 0 0 0 School 0.311267 0.622246 NaN 0.0959 0.0529 0.9851 0.7960 0.0605 0.08 0.0345 0.2917 0.3333 0.0130 0.0773 0.0549 0.0039 0.0098 0.0924 0.0538 0.9851 0.8040 0.0497 0.0806 0.0345 0.2917 0.3333 0.0128 0.079 0.0554 0.0 0.0 0.0968 0.0529 0.9851 0.7987 0.0608 0.08 0.0345 0.2917 0.3333 0.0132 0.0787 0.0558 0.0039 0.01 reg oper account block of flats 0.0714 Block No 1.0 0.0 1.0 0.0 -828.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
100004 0 Revolving loans M Y Y 0 67500.0 135000.0 6750.0 135000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.010032 -19046 -225 -4260.0 -2531 26.0 1 1 1 1 1 0 Laborers 1.0 2 2 MONDAY 9 0 0 0 0 0 0 Government NaN 0.555912 0.729567 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -815.0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
100006 0 Cash loans F N Y 0 135000.0 312682.5 29686.5 297000.0 Unaccompanied Working Secondary / secondary special Civil marriage House / apartment 0.008019 -19005 -3039 -9833.0 -2437 NaN 1 1 0 1 0 0 Laborers 2.0 2 2 WEDNESDAY 17 0 0 0 0 0 0 Business Entity Type 3 NaN 0.650442 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 0.0 2.0 0.0 -617.0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN
100007 0 Cash loans M N Y 0 121500.0 513000.0 21865.5 513000.0 Unaccompanied Working Secondary / secondary special Single / not married House / apartment 0.028663 -19932 -3038 -4311.0 -3458 NaN 1 1 0 1 0 0 Core staff 1.0 2 2 THURSDAY 11 0 0 0 0 1 1 Religion NaN 0.322738 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 -1106.0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0

Utilizamos el ID de cliente como el índice del Dataframe. A continuación, identificamos las variables continuas y categóricas dentro del conjunto de datos

In [4]:
def dame_variables_categoricas(dataset=None):
    '''
    ----------------------------------------------------------------------------------------------------------
    Función dame_variables_categoricas:
    ----------------------------------------------------------------------------------------------------------
    - Descripción:
        Esta función recibe un DataFrame una lista de las variables categóricas 
        (con pocos valores únicos).
        
    - Inputs: 
        - dataset (DataFrame): DataFrame que contiene los datos de entrada.
        
    - Return:
        - lista_variables_categoricas (list): Lista con los nombres de las variables 
          categóricas en el DataFrame.
        - other (list): Lista con los nombres de las variables que no cumplen los criterios 
          para ser categóricas.
        - 1: Indica que la ejecución es incorrecta debido a la falta del 
          argumento 'dataset'.
    '''
    # Verificar que el DataFrame de entrada no sea nulo
    if dataset is None:
        print(u'\nError: Falta el argumento dataset en la función')
        return 1 
    
    lista_variables_categoricas = []  
    other = []  

    # Recorrer las columnas del DataFrame
    for i in dataset.columns:
        
        # Si la columna es de tipo objeto (posiblemente categórica)
        if dataset[i].dtype == object:
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 100:
                lista_variables_categoricas.append(i)  
            else:
                other.append(i)  
                
        # Si la columna es de tipo entero                
        if dataset[i].dtype == int:
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if unicos < 10:
                lista_variables_categoricas.append(i)  
            else:
                other.append(i) 

    return lista_variables_categoricas, other
In [5]:
import numpy as np
#Aquí identificamos las variables categóricas
list_var_cat, other = dame_variables_categoricas(dataset=pd_loan)
pd_loan[list_var_cat] = pd_loan[list_var_cat].astype("category")

#Seleccionamos las columnas que tienen datos numéricos continuos
list_var_continuous = list(pd_loan.select_dtypes('float').columns)
pd_loan[list_var_continuous] = pd_loan[list_var_continuous].astype(float)
pd_loan.dtypes
Out[5]:
TARGET                             int64
NAME_CONTRACT_TYPE              category
CODE_GENDER                     category
FLAG_OWN_CAR                    category
FLAG_OWN_REALTY                 category
CNT_CHILDREN                       int64
AMT_INCOME_TOTAL                 float64
AMT_CREDIT                       float64
AMT_ANNUITY                      float64
AMT_GOODS_PRICE                  float64
NAME_TYPE_SUITE                 category
NAME_INCOME_TYPE                category
NAME_EDUCATION_TYPE             category
NAME_FAMILY_STATUS              category
NAME_HOUSING_TYPE               category
REGION_POPULATION_RELATIVE       float64
DAYS_BIRTH                         int64
DAYS_EMPLOYED                      int64
DAYS_REGISTRATION                float64
DAYS_ID_PUBLISH                    int64
OWN_CAR_AGE                      float64
FLAG_MOBIL                         int64
FLAG_EMP_PHONE                     int64
FLAG_WORK_PHONE                    int64
FLAG_CONT_MOBILE                   int64
FLAG_PHONE                         int64
FLAG_EMAIL                         int64
OCCUPATION_TYPE                 category
CNT_FAM_MEMBERS                  float64
REGION_RATING_CLIENT               int64
REGION_RATING_CLIENT_W_CITY        int64
WEEKDAY_APPR_PROCESS_START      category
HOUR_APPR_PROCESS_START            int64
REG_REGION_NOT_LIVE_REGION         int64
REG_REGION_NOT_WORK_REGION         int64
LIVE_REGION_NOT_WORK_REGION        int64
REG_CITY_NOT_LIVE_CITY             int64
REG_CITY_NOT_WORK_CITY             int64
LIVE_CITY_NOT_WORK_CITY            int64
ORGANIZATION_TYPE               category
EXT_SOURCE_1                     float64
EXT_SOURCE_2                     float64
EXT_SOURCE_3                     float64
APARTMENTS_AVG                   float64
BASEMENTAREA_AVG                 float64
YEARS_BEGINEXPLUATATION_AVG      float64
YEARS_BUILD_AVG                  float64
COMMONAREA_AVG                   float64
ELEVATORS_AVG                    float64
ENTRANCES_AVG                    float64
FLOORSMAX_AVG                    float64
FLOORSMIN_AVG                    float64
LANDAREA_AVG                     float64
LIVINGAPARTMENTS_AVG             float64
LIVINGAREA_AVG                   float64
NONLIVINGAPARTMENTS_AVG          float64
NONLIVINGAREA_AVG                float64
APARTMENTS_MODE                  float64
BASEMENTAREA_MODE                float64
YEARS_BEGINEXPLUATATION_MODE     float64
YEARS_BUILD_MODE                 float64
COMMONAREA_MODE                  float64
ELEVATORS_MODE                   float64
ENTRANCES_MODE                   float64
FLOORSMAX_MODE                   float64
FLOORSMIN_MODE                   float64
LANDAREA_MODE                    float64
LIVINGAPARTMENTS_MODE            float64
LIVINGAREA_MODE                  float64
NONLIVINGAPARTMENTS_MODE         float64
NONLIVINGAREA_MODE               float64
APARTMENTS_MEDI                  float64
BASEMENTAREA_MEDI                float64
YEARS_BEGINEXPLUATATION_MEDI     float64
YEARS_BUILD_MEDI                 float64
COMMONAREA_MEDI                  float64
ELEVATORS_MEDI                   float64
ENTRANCES_MEDI                   float64
FLOORSMAX_MEDI                   float64
FLOORSMIN_MEDI                   float64
LANDAREA_MEDI                    float64
LIVINGAPARTMENTS_MEDI            float64
LIVINGAREA_MEDI                  float64
NONLIVINGAPARTMENTS_MEDI         float64
NONLIVINGAREA_MEDI               float64
FONDKAPREMONT_MODE              category
HOUSETYPE_MODE                  category
TOTALAREA_MODE                   float64
WALLSMATERIAL_MODE              category
EMERGENCYSTATE_MODE             category
OBS_30_CNT_SOCIAL_CIRCLE         float64
DEF_30_CNT_SOCIAL_CIRCLE         float64
OBS_60_CNT_SOCIAL_CIRCLE         float64
DEF_60_CNT_SOCIAL_CIRCLE         float64
DAYS_LAST_PHONE_CHANGE           float64
FLAG_DOCUMENT_2                    int64
FLAG_DOCUMENT_3                    int64
FLAG_DOCUMENT_4                    int64
FLAG_DOCUMENT_5                    int64
FLAG_DOCUMENT_6                    int64
FLAG_DOCUMENT_7                    int64
FLAG_DOCUMENT_8                    int64
FLAG_DOCUMENT_9                    int64
FLAG_DOCUMENT_10                   int64
FLAG_DOCUMENT_11                   int64
FLAG_DOCUMENT_12                   int64
FLAG_DOCUMENT_13                   int64
FLAG_DOCUMENT_14                   int64
FLAG_DOCUMENT_15                   int64
FLAG_DOCUMENT_16                   int64
FLAG_DOCUMENT_17                   int64
FLAG_DOCUMENT_18                   int64
FLAG_DOCUMENT_19                   int64
FLAG_DOCUMENT_20                   int64
FLAG_DOCUMENT_21                   int64
AMT_REQ_CREDIT_BUREAU_HOUR       float64
AMT_REQ_CREDIT_BUREAU_DAY        float64
AMT_REQ_CREDIT_BUREAU_WEEK       float64
AMT_REQ_CREDIT_BUREAU_MON        float64
AMT_REQ_CREDIT_BUREAU_QRT        float64
AMT_REQ_CREDIT_BUREAU_YEAR       float64
dtype: object

Algunas variables clasificadas inicialmente como enteras son de carácter booleano, lo que las hace categóricas.

SEPARACIÓN EN TRAIN Y TEST ESTRATIFICADO

In [6]:
pd_plot_target = pd_loan['TARGET'].value_counts(normalize=False).reset_index()

pd_plot_target.columns = ['TARGET', 'count']

fig = px.bar(pd_plot_target, x='TARGET', y='count')
fig.update_xaxes(tickvals=[0, 1])
fig.show()

Se observa que el muestreo está desbalanceado, ya que la mayoría de los clientes no tienen dificultades para pagar el préstamo. (0 = pagado y 1 = no pagado) El 20% de los datos se asignan al conjunto de prueba y el 80% al conjunto de entrenamiento para entrenar el modelo.

In [7]:
from sklearn.model_selection import train_test_split
X_pd_loan, X_pd_loan_test, y_pd_loan, y_pd_loan_test = train_test_split(pd_loan.drop('TARGET',axis=1), 
                                                                     pd_loan['TARGET'], 
                                                                     stratify=pd_loan['TARGET'], 
                                                                     test_size=0.2)
pd_loan_train = pd.concat([X_pd_loan, y_pd_loan],axis=1)
pd_loan_test = pd.concat([X_pd_loan_test, y_pd_loan_test],axis=1)
In [8]:
print('== Train\n', pd_loan_train['TARGET'].value_counts(normalize=True))
print('== Test\n', pd_loan_test['TARGET'].value_counts(normalize=True))
== Train
 TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64
== Test
 TARGET
0    0.919272
1    0.080728
Name: proportion, dtype: float64

Como observamos en el resultado, cada categoría de la variable target tiene la misma proporción tanto en el train como en el test, lo que permite que el modelo tenga ejemplos de todas las clases en ambos conjuntos y asi evita que el modelo tenga un sesgo hacia una clase en particular.

VISUALIZACIÓN DESCRIPTIVA DE LOS DATOS

Aquí vamos a realizar la suma de la cantidad de valores nulos que hay por filas y por columnas.

pd_null_columnas contiene el número de valores nulos por columna y el % de valores nulos respecto al total de filas. pd_null_filas contiene el número de valores nulos por filas y el porcentaje de valores nulos respecto al total de columnas, y la relación con la variable TARGET

In [9]:
#Veo el número de valores nulos por filas y columnas
pd_series_null_columns = pd_loan_train.isnull().sum().sort_values(ascending=False)
pd_series_null_rows = pd_loan_train.isnull().sum(axis=1).sort_values(ascending=False)
print(pd_series_null_columns.shape, pd_series_null_rows.shape)

pd_null_columnas = pd.DataFrame(pd_series_null_columns, columns=['nulos_columnas'])     
pd_null_filas = pd.DataFrame(pd_series_null_rows, columns=['nulos_filas'])  
pd_null_filas['TARGET'] = pd_loan['TARGET'].copy()
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/pd_loan_train.shape[0]
pd_null_filas['porcentaje_filas']= pd_null_filas['nulos_filas']/pd_loan_train.shape[1]
(121,) (246008,)
In [10]:
pd_null_columnas.head()
Out[10]:
nulos_columnas porcentaje_columnas
COMMONAREA_MODE 171857 0.698583
COMMONAREA_MEDI 171857 0.698583
COMMONAREA_AVG 171857 0.698583
NONLIVINGAPARTMENTS_MEDI 170757 0.694112
NONLIVINGAPARTMENTS_MODE 170757 0.694112
In [11]:
pd_null_filas.head()
Out[11]:
nulos_filas TARGET porcentaje_filas
SK_ID_CURR
255145 61 0 0.504132
412671 61 0 0.504132
274127 61 0 0.504132
235599 61 0 0.504132
329262 61 0 0.504132

Hemos hecho un conteo y porcentaje de valores nulos por filas y columnas para identificar qué columnas y filas tienen más valores nulos por si hubiese que tomar la decisión de imputarlas o eliminarlas

In [12]:
list_var_cat
Out[12]:
['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']
In [13]:
#Realizamos un bucle para las variables categóricas

for variable in list_var_cat:
    if variable != "TARGET":
        fig, axes = plt.subplots(1, 2, figsize=(12, 6))

        # Gráfico de conteo
        sns.countplot(data=pd_loan_train, x=variable, ax=axes[0])
        axes[0].set_title(f'{variable} Número de nulos: {pd_loan_train[variable].isna().sum()}')
        axes[0].set_ylabel('Count')

        # Rotar etiquetas del eje X
        axes[0].tick_params(axis='x', rotation=45)

        # Para cada categoría: gráfico de fracciones
        pd_loan_df = pd_loan_train.groupby([variable, 'TARGET']).size().reset_index(name='count')
        pd_loan_df['fraction'] = pd_loan_df['count'] / pd_loan_df.groupby(variable)['count'].transform('sum')

        sns.barplot(data=pd_loan_df, x=variable, y='fraction', hue='TARGET', ax=axes[1])
        axes[1].set_title(f'{variable} vs TARGET')
        axes[1].set_ylabel('TARGET fraction')

        # Rotar etiquetas del eje X
        axes[1].tick_params(axis='x', rotation=45)

        # Ajustar diseño
        plt.tight_layout()
        plt.show()
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:16: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\1351463941.py:17: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

No description has been provided for this image

A partir de estos gráficos, podemos ver a la izquierda el número de valores nulos para cada categoría de las variables, y en el de la derecha, cómo se distribuye una variable en función de la variable TARGET. Por lo general, vemos que los datos están bastante desbalanceados, pues hay muchas variables que no vemos que tengan un impacto fuerte en el impago del préstamo. A continuación, vamos a comentar algunas variables que puedan ser significativas en el impago del préstamo.

NAME_INCOME_TYPE (Situación laboral del solicitante) En el gráfico de la izquierda vemos que la mayoría de los datos se asocian con 'Working', es decir, personas que tienen un empleo. Otras categorías presentan también una parte de los datos, como 'Pensioner' o 'Commercial Associate'. Otras categorías como 'Student' o 'Unemployed' presentan muy poca distribución de los datos, o ninguna. En el gráfico de la derecha, que muestra la fracción de incumplimiento para cada categoría, vemos que las personas desempleadas o las que están de baja por maternidad, tienen un porcentaje alto de incumplimiento. Los desempleados parece lógico, pues se asocia con la falta de un trabajo estable. Otras categorías como los pensionistas o los trabajadores presentan un grado bajo de incumplimiento, pues están asociados con un salario o pensión.

NAME_EDUCATION_TYPE(Nivel de educación) En el gráfico de la izquierda vemos que los datos se distribuyen sobre todo en clientes que tienen 'Secondary/Secondary Special', seguidos de 'Higher Education'. 'Academid degree' o 'Lower Secundary' son menos comunes.En el gráfico de la derecha se muestra las fracciones de incumplimiento para cada nivel educativo. Como vemos, secundaria, secundaria especial o secundaria baja son los niveles que más probabilidad de incumplimiento tienen, algo que puede estar relacionado por el tipo de empleo que estos clientes tienen, asociado con la baja formación académica recibida. Los clientes con un grado o educación superior presentan menor riesgo de incumplimiento, algo que también puede estar relacionado por el caso contrario al anterior.Podemos concluir que el nivel educativo influye en la capacidad de incumplimiento del solicitante.

NAME_HOUSING_TYPE(Tipo de vivienda) En el gráfico de la izquierda vemos que la mayoría de los datos se asocian con clientes que vivene un apartameto o casa, que por otra parte es lo más normal. Otras categorías presentan una distribución de los datos mucho menor, como 'Municipal Apartment', 'With Parents' o 'Rented Apartment'. En cuanto a su relación con TARGET, vemos que la fracción de incumplimineto es algo mayor para aquellos solicitantes que viven con los padres o de alquiler. Esto puede estar relacionado con un menor grado de cumplimineto, pues los que viven con sus padres es posible que no tengan una ocupación, aunque también pueden ser aún jóvenes para independizarse. Aun así, no parece una variable muy significativa en cuanto al grado de incumplimiento del préstamo.

OCCUPATYON_TYPE (Tipo de trabajo) Al observar la variable, vemos que presenta una buena cantidad de nulos que podríamos imputar para mejorar el análisis. En el gráfico de la izquierda, vemos que la mayoría de los solicitantes pertenecen a 'Laborers'(en mayor proporción), 'Core Staff' y 'Sales Staff'. Como vemos en el gráfico de la derecha, los trabajadores dedicados a actividades manuales o físicas como la construcción, fabricación, agaricultura, así como aquellos con una baja especialización, como por ejemplo, ayudante de obra, presentan mayor incumplimineto de devolver el préstamo. Otros gremios, como los conductores o los vigilantes de seguridad presentan un grado de incumplimiento menor, aunque mayor que el resto. Esto puede estar debido a la formación educativa, ya que estos trabajos no requieren de una cualificación para dedicarse a ellos. Podemos concluir que el tipo de ocupación puede influir en el grado de incumplimiento del préstamo.

CODE_GENDER(Género) Como vemos en el gráfico de la izquierda, hay mayor proporción de clientes femeninos que masculinos.En el gráfico de la derecha no parece haber una diferencia extremadamente significativa entre el grado de incumplimineto de hombres y mujeres, aunque si nos fijamos el grado de incumplimiento mayor es ligeramente masculino. Esto puede deberse a muchas razones como el tipo de trabajo, mentalidad o tipo de educación, entre otras. En conclusión, esta variable tiene una relación moderada con la probabilidad de incumplkimiento, siendo importante considerar cómoa actúa con otras variables.

NAME_CONTRACT_TYPE (Tipo de contrato) En el gráfico de la izquierda vemos que la mayoría de los préstamos pertenecen a 'Cash Loans', mientras que los 'Revolving Loans' tienen una proproción mucho menor. Con respecto a la variable TARGET, vemos que no hay una gran diferencia en la probabilidad de incumplimiento entre ambos préstamos, pero sí que es verdad que los 'Cash Loans' tienen una probabilidad mayor, cosa que tiene sentido al ser mucho más comunes en el día a día.

In [14]:
for variable in list_var_continuous:  # Lista de variables continuas
    if variable != "TARGET":
        fig, axes = plt.subplots(1, 2, figsize=(12, 6))

        # Histograma
        sns.histplot(data=pd_loan_train, x=variable, kde=True, ax=axes[0], bins=30)
        axes[0].set_title(f'{variable} Número de nulos: {pd_loan_train[variable].isna().sum()}')
        axes[0].set_ylabel('Frequency')

        # Boxplot vs TARGET
        sns.boxplot(data=pd_loan_train, x='TARGET', y=variable, ax=axes[1])
        axes[1].set_title(f'{variable} vs TARGET')
        axes[1].set_ylabel(variable)
        axes[1].set_xlabel('TARGET')

        # Ajustar diseño
        plt.tight_layout()
        plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Una vez realizados los gráficos de las variables continuas con la variable objetivo (TARGET), procederemos a realizar algunos comentarios de algunas variables que puedan influir en la misma, así como algunas que no sean tan significativas para tener un ejemplo de todo. A continuación, veremos cuáles son las variables elegidas.

OWN_CAR_AGE (Edad del coche propio). Como vemos en el gráfico de la izquierda, hay un número considerable de valores nulos, concretamente 162.426, lo que indica que muchas personas no poseen automóvil o no hay información a cerca de ellos. Esto se debería gestionar mediante, por ejemplo, con la imputación o creando una categoría específica para los que no tienen automóvil. La mayoría de los coches tienen entre 0 y 20 años, con una concentración importante sobre todo de 0 a 10. No hay casi valores más allá de los 40 años, solo unos pocos datos sobrepasan los 60 años. En cuanto a su relación con TARGET, vemos en los boxplots que no hay grandes diferencias en la mediana ni en el rango intercuartílico para 0 y para 1, lo que nos puede indicar que no el coche propio puede no ser un factor influyente en cuanto a devolver o no el crédito.

EXT_SOURCE_1 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas). En el gráfico de la izquierda se puede observar que la variable sigue una distribución simétrica con forma similar a una distribución normal. Esta variable tiene una elevada cantidad de valores lo que indica que puede ser útil para discriminar entre distintos tipos de solicitantes. En cuanto a su relación con TARGET, observamos que el boxplot muestra que la mediana para los clientes que no incumplen (TARGET=0) es algo mayor que la de los que sí incumplen (TARGET=1) tienden a tener valores más altos, lo que puede decir que esta variable está asociada con un menor riesgo de incumplimiento.El rango intercuartílico es algo mayor también en los clientes cumplidores que en los que no. Podemos decir que esta variable puede tener un cierto poder predictivo.Los valores altos de EXT_SOURCE_1 se asocian con un menor riesgo de incumplimiento.

CNT_FAM_MEMBERS (Numero de miembros de la familia). En el gráfico de la izquierda vemos que la variable posee 2 valores nulos, algo que no es significativo dentro del total de datos. En el histograma se muestra que la mayoría de los datos se encuentran entre 2 y 4 miembros, concentrándose sobre todo en 2, sugiriendo que una gran parte de las familias son pequeñas. En relación con TARGET, el boxplot muestra que la distribución es similiar tanto para los cumplidores como para los que no. Tanto la mediana como el rango intercuartílico son iguales, lo que indica que no hay una gran relación con la probabilidad de impago. Hay algunas familias con un número elevado de miembros, aunque no parece que tengan un efecto significativo.

AMT_INCOME_TOTAL (Ingresos totales). Esta variable no posee valores nulos, por lo que podría ser representativa. En cuanto al histograma, la distribución de la variable está sesgada hacia la izquierda, lo que indica que la mayoría de clientes tienen ingresos bajos. La distribuación de los ingresos para TARGET=0 y TARGET=1 es muy similar, La mediana y el rango intercuartílico son también casi iguales, lo que indica que el nivel de ingresos no tiene una relación directa con la posibilidad de impago.

EXT_SOURCE_2 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas) En este caso, presenta pocos valores nulos con respecto al total de los datos, aunque se podrían imputar para no perder demasiada información. En el histograma vemos que la mayoría de los valores están concentrados entre 0.4 y 0.8, es decir, sesgados a la derecha, mientras que los valores extremos son menos comunes.En cuanto a su relación con TARGET, vemos que los valores más altos están relacionados con los clientes que no incumplen. La mediana es mas alta para los clientes que no incumplen, lo que un valor más alto de la variable podría estar asociado con un menor riesgo de incumplimiento. El rango intercuartílico es bastante parecido, aunque se observa que los valores bajos se concentran más en los clientes que presentan incumplimiento, por lo que podemos decir que esta variable puede ser un buen predictor.

EXT_SOURCE_3 (Puntuaciones de riesgo crediticio para los solicitantes que provienen de fuentes externas) Al igual que en las variables anteriores como EXT_SOURCE_1 y EXT_SOURCE_2, prsenta los valores más altos asocidados a clientes que no incumplen el préstamo, lo que indica que valores bajos poseen un mayor riesgo de incumplimiento. Por lo tanto, podemos concluir que la información externa sobre clientes solicitantes de préstamo es una buena variable predictora.

TRATAMIENTO DE VARIABLES CONTINUAS

A continuación se tratarán tres aspectos clave del análisis de los datos: los valores missing, los outliers y las correlaciones entre las variables continuas

In [15]:
list_var_continuous
Out[15]:
['AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_REGISTRATION',
 'OWN_CAR_AGE',
 'CNT_FAM_MEMBERS',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'TOTALAREA_MODE',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR']
In [16]:
def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    '''
    ----------------------------------------------------------------------------------------------------------
    Función get_deviation_of_mean_perc:
    ----------------------------------------------------------------------------------------------------------
    - Descripción:
        Esta función calcula el porcentaje de valores que se encuentran fuera de un 
        intervalo de confianza, determinado por la media y una desviación estándar 
        multiplicada por un factor (multiplier), para cada variable continua en el 
        DataFrame. Luego, analiza la relación entre estos valores atípicos y la variable 
        objetivo (TARGET), y devuelve un resumen con los porcentajes de valores atípicos 
        y su distribución en relación con la variable objetivo.
        
    - Inputs: 
        - pd_loan (DataFrame): DataFrame que contiene los datos de entrada.
        - list_var_continuous (list): Lista con los nombres de las variables continuas 
          a analizar.
        - target (str): Nombre de la variable objetivo en el DataFrame.
        - multiplier (float): Factor multiplicador para calcular el intervalo de confianza
          (desviación estándar).
        
    - Return:
        - pd_final (DataFrame): DataFrame que contiene el porcentaje de valores atípicos 
          por cada variable continua, su distribución con respecto a la variable objetivo 
          (TARGET), y otros detalles relevantes.
    '''
    
    pd_final = pd.DataFrame()
    
    for i in list_var_continuous:
        
        series_mean = pd_loan[i].mean()
        series_std = pd_loan[i].std()
        std_amp = multiplier * series_std
        left = series_mean - std_amp
        right = series_mean + std_amp
        size_s = pd_loan[i].size
        
        perc_goods = pd_loan[i][(pd_loan[i] >= left) & (pd_loan[i] <= right)].size/size_s
        perc_excess = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size/size_s
        
        if perc_excess>0:    
            pd_concat_percent = pd.DataFrame(pd_loan[target][(pd_loan[i] < left) | (pd_loan[i] > right)]\
                                            .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0,0], 
                                         pd_concat_percent.iloc[0,1]]

            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_outlier_values'] = pd_loan[i][(pd_loan[i] < left) | (pd_loan[i] > right)].size
            pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)
            
    if pd_final.empty:
        print('No existen variables con valores nulos')
        
    return pd_final
In [17]:
get_deviation_of_mean_perc(pd_loan_train, list_var_continuous, target='TARGET', multiplier=3)
Out[17]:
0.0 1.0 variable sum_outlier_values porcentaje_sum_null_values
0 0.000000 1.000000 AMT_INCOME_TOTAL 215 0.000874
1 0.948837 0.051163 AMT_INCOME_TOTAL 215 0.000874
2 0.000000 1.000000 AMT_CREDIT 2638 0.010723
3 0.958681 0.041319 AMT_CREDIT 2638 0.010723
4 0.000000 1.000000 AMT_ANNUITY 2354 0.009569
5 0.964741 0.035259 AMT_ANNUITY 2354 0.009569
6 0.000000 1.000000 AMT_GOODS_PRICE 3371 0.013703
7 0.961732 0.038268 AMT_GOODS_PRICE 3371 0.013703
8 0.000000 1.000000 REGION_POPULATION_RELATIVE 6686 0.027178
9 0.959168 0.040832 REGION_POPULATION_RELATIVE 6686 0.027178
10 0.000000 1.000000 DAYS_REGISTRATION 605 0.002459
11 0.958678 0.041322 DAYS_REGISTRATION 605 0.002459
12 0.000000 1.000000 OWN_CAR_AGE 2677 0.010882
13 0.917445 0.082555 OWN_CAR_AGE 2677 0.010882
14 0.000000 1.000000 CNT_FAM_MEMBERS 3223 0.013101
15 0.900093 0.099907 CNT_FAM_MEMBERS 3223 0.013101
16 0.000000 1.000000 APARTMENTS_AVG 2387 0.009703
17 0.948890 0.051110 APARTMENTS_AVG 2387 0.009703
18 0.000000 1.000000 BASEMENTAREA_AVG 1588 0.006455
19 0.947733 0.052267 BASEMENTAREA_AVG 1588 0.006455
20 0.000000 1.000000 YEARS_BEGINEXPLUATATION_AVG 552 0.002244
21 0.902174 0.097826 YEARS_BEGINEXPLUATATION_AVG 552 0.002244
22 0.000000 1.000000 YEARS_BUILD_AVG 964 0.003919
23 0.930498 0.069502 YEARS_BUILD_AVG 964 0.003919
24 0.000000 1.000000 COMMONAREA_AVG 1371 0.005573
25 0.951131 0.048869 COMMONAREA_AVG 1371 0.005573
26 0.000000 1.000000 ELEVATORS_AVG 1941 0.007890
27 0.954663 0.045337 ELEVATORS_AVG 1941 0.007890
28 0.000000 1.000000 ENTRANCES_AVG 1759 0.007150
29 0.938033 0.061967 ENTRANCES_AVG 1759 0.007150
30 0.000000 1.000000 FLOORSMAX_AVG 2056 0.008357
31 0.957198 0.042802 FLOORSMAX_AVG 2056 0.008357
32 0.000000 1.000000 FLOORSMIN_AVG 474 0.001927
33 0.955696 0.044304 FLOORSMIN_AVG 474 0.001927
34 0.000000 1.000000 LANDAREA_AVG 1696 0.006894
35 0.938679 0.061321 LANDAREA_AVG 1696 0.006894
36 0.000000 1.000000 LIVINGAPARTMENTS_AVG 1393 0.005662
37 0.952620 0.047380 LIVINGAPARTMENTS_AVG 1393 0.005662
38 0.000000 1.000000 LIVINGAREA_AVG 2568 0.010439
39 0.948988 0.051012 LIVINGAREA_AVG 2568 0.010439
40 0.000000 1.000000 NONLIVINGAPARTMENTS_AVG 586 0.002382
41 0.930034 0.069966 NONLIVINGAPARTMENTS_AVG 586 0.002382
42 0.000000 1.000000 NONLIVINGAREA_AVG 1956 0.007951
43 0.947342 0.052658 NONLIVINGAREA_AVG 1956 0.007951
44 0.000000 1.000000 APARTMENTS_MODE 2411 0.009800
45 0.948569 0.051431 APARTMENTS_MODE 2411 0.009800
46 0.000000 1.000000 BASEMENTAREA_MODE 1689 0.006866
47 0.944938 0.055062 BASEMENTAREA_MODE 1689 0.006866
48 0.000000 1.000000 YEARS_BEGINEXPLUATATION_MODE 540 0.002195
49 0.900000 0.100000 YEARS_BEGINEXPLUATATION_MODE 540 0.002195
50 0.000000 1.000000 YEARS_BUILD_MODE 975 0.003963
51 0.931282 0.068718 YEARS_BUILD_MODE 975 0.003963
52 0.000000 1.000000 COMMONAREA_MODE 1359 0.005524
53 0.946284 0.053716 COMMONAREA_MODE 1359 0.005524
54 0.000000 1.000000 ELEVATORS_MODE 2679 0.010890
55 0.951101 0.048899 ELEVATORS_MODE 2679 0.010890
56 0.000000 1.000000 ENTRANCES_MODE 2094 0.008512
57 0.941261 0.058739 ENTRANCES_MODE 2094 0.008512
58 0.000000 1.000000 FLOORSMAX_MODE 2085 0.008475
59 0.958753 0.041247 FLOORSMAX_MODE 2085 0.008475
60 0.000000 1.000000 FLOORSMIN_MODE 382 0.001553
61 0.958115 0.041885 FLOORSMIN_MODE 382 0.001553
62 0.000000 1.000000 LANDAREA_MODE 1714 0.006967
63 0.937573 0.062427 LANDAREA_MODE 1714 0.006967
64 0.000000 1.000000 LIVINGAPARTMENTS_MODE 1438 0.005845
65 0.949235 0.050765 LIVINGAPARTMENTS_MODE 1438 0.005845
66 0.000000 1.000000 LIVINGAREA_MODE 2687 0.010922
67 0.947897 0.052103 LIVINGAREA_MODE 2687 0.010922
68 0.000000 1.000000 NONLIVINGAPARTMENTS_MODE 546 0.002219
69 0.921245 0.078755 NONLIVINGAPARTMENTS_MODE 546 0.002219
70 0.000000 1.000000 NONLIVINGAREA_MODE 1994 0.008105
71 0.948345 0.051655 NONLIVINGAREA_MODE 1994 0.008105
72 0.000000 1.000000 APARTMENTS_MEDI 2428 0.009870
73 0.948929 0.051071 APARTMENTS_MEDI 2428 0.009870
74 0.000000 1.000000 BASEMENTAREA_MEDI 1621 0.006589
75 0.948180 0.051820 BASEMENTAREA_MEDI 1621 0.006589
76 0.000000 1.000000 YEARS_BEGINEXPLUATATION_MEDI 515 0.002093
77 0.897087 0.102913 YEARS_BEGINEXPLUATATION_MEDI 515 0.002093
78 0.000000 1.000000 YEARS_BUILD_MEDI 976 0.003967
79 0.931352 0.068648 YEARS_BUILD_MEDI 976 0.003967
80 0.000000 1.000000 COMMONAREA_MEDI 1393 0.005662
81 0.949749 0.050251 COMMONAREA_MEDI 1393 0.005662
82 0.000000 1.000000 ELEVATORS_MEDI 1930 0.007845
83 0.954922 0.045078 ELEVATORS_MEDI 1930 0.007845
84 0.000000 1.000000 ENTRANCES_MEDI 1769 0.007191
85 0.937818 0.062182 ENTRANCES_MEDI 1769 0.007191
86 0.000000 1.000000 FLOORSMAX_MEDI 2166 0.008805
87 0.957525 0.042475 FLOORSMAX_MEDI 2166 0.008805
88 0.000000 1.000000 FLOORSMIN_MEDI 443 0.001801
89 0.957111 0.042889 FLOORSMIN_MEDI 443 0.001801
90 0.000000 1.000000 LANDAREA_MEDI 1717 0.006979
91 0.940594 0.059406 LANDAREA_MEDI 1717 0.006979
92 0.000000 1.000000 LIVINGAPARTMENTS_MEDI 1411 0.005736
93 0.951807 0.048193 LIVINGAPARTMENTS_MEDI 1411 0.005736
94 0.000000 1.000000 LIVINGAREA_MEDI 2581 0.010492
95 0.950794 0.049206 LIVINGAREA_MEDI 2581 0.010492
96 0.000000 1.000000 NONLIVINGAPARTMENTS_MEDI 584 0.002374
97 0.928082 0.071918 NONLIVINGAPARTMENTS_MEDI 584 0.002374
98 0.000000 1.000000 NONLIVINGAREA_MEDI 1977 0.008036
99 0.948407 0.051593 NONLIVINGAREA_MEDI 1977 0.008036
100 0.000000 1.000000 TOTALAREA_MODE 2655 0.010792
101 0.958192 0.041808 TOTALAREA_MODE 2655 0.010792
102 0.000000 1.000000 OBS_30_CNT_SOCIAL_CIRCLE 4932 0.020048
103 0.912003 0.087997 OBS_30_CNT_SOCIAL_CIRCLE 4932 0.020048
104 0.000000 1.000000 DEF_30_CNT_SOCIAL_CIRCLE 5516 0.022422
105 0.883249 0.116751 DEF_30_CNT_SOCIAL_CIRCLE 5516 0.022422
106 0.000000 1.000000 OBS_60_CNT_SOCIAL_CIRCLE 4787 0.019459
107 0.912053 0.087947 OBS_60_CNT_SOCIAL_CIRCLE 4787 0.019459
108 0.000000 1.000000 DEF_60_CNT_SOCIAL_CIRCLE 3174 0.012902
109 0.875551 0.124449 DEF_60_CNT_SOCIAL_CIRCLE 3174 0.012902
110 0.000000 1.000000 DAYS_LAST_PHONE_CHANGE 511 0.002077
111 0.958904 0.041096 DAYS_LAST_PHONE_CHANGE 511 0.002077
112 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_HOUR 1277 0.005191
113 0.924041 0.075959 AMT_REQ_CREDIT_BUREAU_HOUR 1277 0.005191
114 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_DAY 1176 0.004780
115 0.909014 0.090986 AMT_REQ_CREDIT_BUREAU_DAY 1176 0.004780
116 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_WEEK 6754 0.027454
117 0.922564 0.077436 AMT_REQ_CREDIT_BUREAU_WEEK 6754 0.027454
118 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_MON 2580 0.010487
119 0.946512 0.053488 AMT_REQ_CREDIT_BUREAU_MON 2580 0.010487
120 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_QRT 1842 0.007488
121 0.915852 0.084148 AMT_REQ_CREDIT_BUREAU_QRT 1842 0.007488
122 0.000000 1.000000 AMT_REQ_CREDIT_BUREAU_YEAR 2713 0.011028
123 0.908220 0.091780 AMT_REQ_CREDIT_BUREAU_YEAR 2713 0.011028

CORRELACIONES

In [18]:
def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    ''''
    ----------------------------------------------------------------------------------------------------------
    Función get_corr_matrix:
    ----------------------------------------------------------------------------------------------------------
    - Descripción:
        Esta función calcula y visualiza la matriz de correlación entre las variables 
        numéricas de un conjunto de datos. 

    - Inputs: 
        - dataset (DataFrame): Conjunto de datos con las variables numéricas a analizar.
        - metodo (str): Método de correlación a utilizar.
        - size_figure (list): Tamaño de la figura del gráfico.

    - Return:
        - None: Muestra un mapa de calor de la matriz de correlación.
    ----------------------------------------------------------------------------------------------------------
    '''
    
    # Comprobación de que se ha proporcionado el dataset
    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1
    sns.set(style="white")
    
    # Calcular la matriz de correlación
    corr = dataset.corr(method=metodo) 
    
    # Establecer la autocorrelación a cero para evitar distracciones
    for i in range(corr.shape[0]):
        corr.iloc[i, i] = 0
    
    f, ax = plt.subplots(figsize=size_figure)
    
    # Dibujar el mapa de calor con la correlación
    sns.heatmap(corr, center=0,
                square=True, linewidths=.5,  cmap ='viridis' ) #cbar_kws={"shrink": .5}
    plt.show()
    
    return 0
In [19]:
get_corr_matrix(dataset = pd_loan_train[list_var_continuous], 
                metodo='pearson', size_figure=[10,8])
No description has been provided for this image
Out[19]:
0

En la matriz de correlación se observan varias correlaciones de Pearson cercanas o iguales a 1, lo que indica que algunas variables están perfectamente correlacionadas. Esto puede deberse a la existencia de variables redundantes o derivadas de otras, lo que podría generar problemas de multicolinealidad en modelos estadísticos. Es recomendable revisar estas variables y, en caso de ser necesario, eliminar las que sean idénticas o altamente correlacionadas para evitar distorsiones en los resultados del modelo.

In [20]:
corr = pd_loan_train[list_var_continuous].corr('pearson')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) # below main lower triangle of an array
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
Out[20]:
level_0 level_1 correlation
3694 OBS_60_CNT_SOCIAL_CIRCLE OBS_30_CNT_SOCIAL_CIRCLE 0.998498
2744 YEARS_BUILD_MEDI YEARS_BUILD_AVG 0.998480
3074 FLOORSMIN_MEDI FLOORSMIN_AVG 0.997218
3008 FLOORSMAX_MEDI FLOORSMAX_AVG 0.996977
2942 ENTRANCES_MEDI ENTRANCES_AVG 0.996884
2876 ELEVATORS_MEDI ELEVATORS_AVG 0.996026
2810 COMMONAREA_MEDI COMMONAREA_AVG 0.995566
3272 LIVINGAREA_MEDI LIVINGAREA_AVG 0.995388
2612 BASEMENTAREA_MEDI BASEMENTAREA_AVG 0.994972
2546 APARTMENTS_MEDI APARTMENTS_AVG 0.994890
3206 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.993515
2678 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_AVG 0.993382
3140 LANDAREA_MEDI LANDAREA_AVG 0.990689
3404 NONLIVINGAREA_MEDI NONLIVINGAREA_AVG 0.989493
2758 YEARS_BUILD_MEDI YEARS_BUILD_MODE 0.989490
1834 YEARS_BUILD_MODE YEARS_BUILD_AVG 0.989383
3338 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_AVG 0.988915
3088 FLOORSMIN_MEDI FLOORSMIN_MODE 0.988121
3022 FLOORSMAX_MEDI FLOORSMAX_MODE 0.987859
196 AMT_GOODS_PRICE AMT_CREDIT 0.987028
2164 FLOORSMIN_MODE FLOORSMIN_AVG 0.985501
2098 FLOORSMAX_MODE FLOORSMAX_AVG 0.985307
2890 ELEVATORS_MEDI ELEVATORS_MODE 0.982507
2956 ENTRANCES_MEDI ENTRANCES_MODE 0.980636
3154 LANDAREA_MEDI LANDAREA_MODE 0.980086
2824 COMMONAREA_MEDI COMMONAREA_MODE 0.978585
1966 ELEVATORS_MODE ELEVATORS_AVG 0.978504
2032 ENTRANCES_MODE ENTRANCES_AVG 0.977752
2626 BASEMENTAREA_MEDI BASEMENTAREA_MODE 0.977507
2560 APARTMENTS_MEDI APARTMENTS_MODE 0.976882
3418 NONLIVINGAREA_MEDI NONLIVINGAREA_MODE 0.975851
3352 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_MODE 0.975829
1900 COMMONAREA_MODE COMMONAREA_AVG 0.975696
3220 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.975206
3286 LIVINGAREA_MEDI LIVINGAREA_MODE 0.974256
1702 BASEMENTAREA_MODE BASEMENTAREA_AVG 0.973761
1636 APARTMENTS_MODE APARTMENTS_AVG 0.972757
2230 LANDAREA_MODE LANDAREA_AVG 0.972270
2362 LIVINGAREA_MODE LIVINGAREA_AVG 0.971495
1768 YEARS_BEGINEXPLUATATION_MODE YEARS_BEGINEXPLUATATION_AVG 0.970595
2296 LIVINGAPARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.969416
2494 NONLIVINGAREA_MODE NONLIVINGAREA_AVG 0.965026
2428 NONLIVINGAPARTMENTS_MODE NONLIVINGAPARTMENTS_AVG 0.964931
2692 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_MODE 0.961011
1376 LIVINGAPARTMENTS_AVG APARTMENTS_AVG 0.943873
3224 LIVINGAPARTMENTS_MEDI APARTMENTS_MEDI 0.942700
3196 LIVINGAPARTMENTS_MEDI APARTMENTS_AVG 0.941100
2300 LIVINGAPARTMENTS_MODE APARTMENTS_MODE 0.938834
2556 APARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.935462
2570 APARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.932595
2286 LIVINGAPARTMENTS_MODE APARTMENTS_AVG 0.930757
3467 TOTALAREA_MODE LIVINGAREA_AVG 0.925346
3495 TOTALAREA_MODE LIVINGAREA_MEDI 0.919845
3289 LIVINGAREA_MEDI APARTMENTS_MEDI 0.915476
3210 LIVINGAPARTMENTS_MEDI APARTMENTS_MODE 0.914985
1441 LIVINGAREA_AVG APARTMENTS_AVG 0.912775
2557 APARTMENTS_MEDI LIVINGAREA_AVG 0.911891
3261 LIVINGAREA_MEDI APARTMENTS_AVG 0.911658
2365 LIVINGAREA_MODE APARTMENTS_MODE 0.910327
1646 APARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.908376
3481 TOTALAREA_MODE LIVINGAREA_MODE 0.899464
2571 APARTMENTS_MEDI LIVINGAREA_MODE 0.895374
3275 LIVINGAREA_MEDI APARTMENTS_MODE 0.893746
2351 LIVINGAREA_MODE APARTMENTS_AVG 0.892603
3456 TOTALAREA_MODE APARTMENTS_AVG 0.891460
1647 APARTMENTS_MODE LIVINGAREA_AVG 0.890110
3484 TOTALAREA_MODE APARTMENTS_MEDI 0.885698
3299 LIVINGAREA_MEDI LIVINGAPARTMENTS_MEDI 0.885637
3207 LIVINGAPARTMENTS_MEDI LIVINGAREA_AVG 0.883687
1451 LIVINGAREA_AVG LIVINGAPARTMENTS_AVG 0.880839
2375 LIVINGAREA_MODE LIVINGAPARTMENTS_MODE 0.879683
3271 LIVINGAREA_MEDI LIVINGAPARTMENTS_AVG 0.878834
3285 LIVINGAREA_MEDI LIVINGAPARTMENTS_MODE 0.875439
2297 LIVINGAPARTMENTS_MODE LIVINGAREA_AVG 0.873638
3294 LIVINGAREA_MEDI ELEVATORS_MEDI 0.868963
1446 LIVINGAREA_AVG ELEVATORS_AVG 0.868139
2882 ELEVATORS_MEDI LIVINGAREA_AVG 0.866215
3266 LIVINGAREA_MEDI ELEVATORS_AVG 0.866211
3470 TOTALAREA_MODE APARTMENTS_MODE 0.862547
3760 DEF_60_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE 0.861292
3221 LIVINGAPARTMENTS_MEDI LIVINGAREA_MODE 0.858074
2370 LIVINGAREA_MODE ELEVATORS_MODE 0.856832
3280 LIVINGAREA_MEDI ELEVATORS_MODE 0.856177
1972 ELEVATORS_MODE LIVINGAREA_AVG 0.852708
2361 LIVINGAREA_MODE LIVINGAPARTMENTS_AVG 0.852073
3466 TOTALAREA_MODE LIVINGAPARTMENTS_AVG 0.848978
3494 TOTALAREA_MODE LIVINGAPARTMENTS_MEDI 0.847425
3461 TOTALAREA_MODE ELEVATORS_AVG 0.844127
2896 ELEVATORS_MEDI LIVINGAREA_MODE 0.841342
2356 LIVINGAREA_MODE ELEVATORS_AVG 0.839310
3489 TOTALAREA_MODE ELEVATORS_MEDI 0.837782
2899 ELEVATORS_MEDI APARTMENTS_MEDI 0.836951
1051 ELEVATORS_AVG APARTMENTS_AVG 0.836207
3480 TOTALAREA_MODE LIVINGAPARTMENTS_MODE 0.834900
2871 ELEVATORS_MEDI APARTMENTS_AVG 0.834418
2551 APARTMENTS_MEDI ELEVATORS_AVG 0.834012
1975 ELEVATORS_MODE APARTMENTS_MODE 0.825476
2565 APARTMENTS_MEDI ELEVATORS_MODE 0.824589
1961 ELEVATORS_MODE APARTMENTS_AVG 0.821284
3475 TOTALAREA_MODE ELEVATORS_MODE 0.819961
3229 LIVINGAPARTMENTS_MEDI ELEVATORS_MEDI 0.814474
3201 LIVINGAPARTMENTS_MEDI ELEVATORS_AVG 0.812741
1381 LIVINGAPARTMENTS_AVG ELEVATORS_AVG 0.811521
2881 ELEVATORS_MEDI LIVINGAPARTMENTS_AVG 0.809250
2305 LIVINGAPARTMENTS_MODE ELEVATORS_MODE 0.808508
2885 ELEVATORS_MEDI APARTMENTS_MODE 0.807934
1641 APARTMENTS_MODE ELEVATORS_AVG 0.805094
3215 LIVINGAPARTMENTS_MEDI ELEVATORS_MODE 0.799520
2895 ELEVATORS_MEDI LIVINGAPARTMENTS_MODE 0.799019
2291 LIVINGAPARTMENTS_MODE ELEVATORS_AVG 0.797236
1971 ELEVATORS_MODE LIVINGAPARTMENTS_AVG 0.793922
197 AMT_GOODS_PRICE AMT_ANNUITY 0.775646
131 AMT_ANNUITY AMT_CREDIT 0.770755
1253 FLOORSMIN_AVG FLOORSMAX_AVG 0.744612
3101 FLOORSMIN_MEDI FLOORSMAX_MEDI 0.742574
3009 FLOORSMAX_MEDI FLOORSMIN_AVG 0.742076
3073 FLOORSMIN_MEDI FLOORSMAX_AVG 0.742022
3087 FLOORSMIN_MEDI FLOORSMAX_MODE 0.731779
2099 FLOORSMAX_MODE FLOORSMIN_AVG 0.731063
2177 FLOORSMIN_MODE FLOORSMAX_MODE 0.728064
3023 FLOORSMAX_MEDI FLOORSMIN_MODE 0.724399
2163 FLOORSMIN_MODE FLOORSMAX_AVG 0.723555
1442 LIVINGAREA_AVG BASEMENTAREA_AVG 0.693530
2622 BASEMENTAREA_MEDI LIVINGAREA_AVG 0.692890
3290 LIVINGAREA_MEDI BASEMENTAREA_MEDI 0.692285
2366 LIVINGAREA_MODE BASEMENTAREA_MODE 0.691908
3262 LIVINGAREA_MEDI BASEMENTAREA_AVG 0.690430
2636 BASEMENTAREA_MEDI LIVINGAREA_MODE 0.681994
2639 BASEMENTAREA_MEDI APARTMENTS_MEDI 0.680139
791 BASEMENTAREA_AVG APARTMENTS_AVG 0.679079
2352 LIVINGAREA_MODE BASEMENTAREA_AVG 0.679063
1186 FLOORSMAX_AVG ELEVATORS_AVG 0.679062
2611 BASEMENTAREA_MEDI APARTMENTS_AVG 0.678390
1715 BASEMENTAREA_MODE APARTMENTS_MODE 0.678241
2547 APARTMENTS_MEDI BASEMENTAREA_AVG 0.678062
3006 FLOORSMAX_MEDI ELEVATORS_AVG 0.676727
2878 ELEVATORS_MEDI FLOORSMAX_AVG 0.675519
3034 FLOORSMAX_MEDI ELEVATORS_MEDI 0.675033
1712 BASEMENTAREA_MODE LIVINGAREA_AVG 0.674008
3457 TOTALAREA_MODE BASEMENTAREA_AVG 0.674000
3276 LIVINGAREA_MEDI BASEMENTAREA_MODE 0.673957
3485 TOTALAREA_MODE BASEMENTAREA_MEDI 0.670770
2096 FLOORSMAX_MODE ELEVATORS_AVG 0.669606
2625 BASEMENTAREA_MEDI APARTMENTS_MODE 0.668767
2892 ELEVATORS_MEDI FLOORSMAX_MODE 0.668033
1637 APARTMENTS_MODE BASEMENTAREA_AVG 0.666231
2561 APARTMENTS_MEDI BASEMENTAREA_MODE 0.662903
1701 BASEMENTAREA_MODE APARTMENTS_AVG 0.660205
2110 FLOORSMAX_MODE ELEVATORS_MODE 0.659724
2041 ENTRANCES_MODE BASEMENTAREA_MODE 0.655824
1707 BASEMENTAREA_MODE ENTRANCES_AVG 0.655192
1968 ELEVATORS_MODE FLOORSMAX_AVG 0.655152
2617 BASEMENTAREA_MEDI ENTRANCES_AVG 0.654544
3020 FLOORSMAX_MEDI ELEVATORS_MODE 0.654446
2301 LIVINGAPARTMENTS_MODE BASEMENTAREA_MODE 0.654209
2951 ENTRANCES_MEDI BASEMENTAREA_MODE 0.654198
2965 ENTRANCES_MEDI BASEMENTAREA_MEDI 0.653110
1117 ENTRANCES_AVG BASEMENTAREA_AVG 0.653068
2635 BASEMENTAREA_MEDI LIVINGAPARTMENTS_MODE 0.652382
3225 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MEDI 0.652122
3471 TOTALAREA_MODE BASEMENTAREA_MODE 0.650148
3197 LIVINGAPARTMENTS_MEDI BASEMENTAREA_AVG 0.649861
2937 ENTRANCES_MEDI BASEMENTAREA_AVG 0.648946
2287 LIVINGAPARTMENTS_MODE BASEMENTAREA_AVG 0.648856
1377 LIVINGAPARTMENTS_AVG BASEMENTAREA_AVG 0.647472
2621 BASEMENTAREA_MEDI LIVINGAPARTMENTS_AVG 0.647065
2631 BASEMENTAREA_MEDI ENTRANCES_MODE 0.632527
3463 TOTALAREA_MODE FLOORSMAX_AVG 0.632427
1448 LIVINGAREA_AVG FLOORSMAX_AVG 0.630578
3491 TOTALAREA_MODE FLOORSMAX_MEDI 0.629713
3211 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MODE 0.629397
3012 FLOORSMAX_MEDI LIVINGAREA_AVG 0.628444
2027 ENTRANCES_MODE BASEMENTAREA_AVG 0.627951
3268 LIVINGAREA_MEDI FLOORSMAX_AVG 0.627410
3296 LIVINGAREA_MEDI FLOORSMAX_MEDI 0.626521
2102 FLOORSMAX_MODE LIVINGAREA_AVG 0.625773
3477 TOTALAREA_MODE FLOORSMAX_MODE 0.625002
1711 BASEMENTAREA_MODE LIVINGAPARTMENTS_AVG 0.624275
3282 LIVINGAREA_MEDI FLOORSMAX_MODE 0.624138
2371 LIVINGAREA_MODE ENTRANCES_MODE 0.623087
2357 LIVINGAREA_MODE ENTRANCES_AVG 0.623077
2961 ENTRANCES_MEDI LIVINGAREA_MODE 0.622534
3267 LIVINGAREA_MEDI ENTRANCES_AVG 0.619274
3295 LIVINGAREA_MEDI ENTRANCES_MEDI 0.618969
1447 LIVINGAREA_AVG ENTRANCES_AVG 0.618727
1181 FLOORSMAX_AVG APARTMENTS_AVG 0.618204
3001 FLOORSMAX_MEDI APARTMENTS_AVG 0.615957
2947 ENTRANCES_MEDI LIVINGAREA_AVG 0.614989
2040 ENTRANCES_MODE APARTMENTS_MODE 0.614956
2553 APARTMENTS_MEDI FLOORSMAX_AVG 0.614953
3029 FLOORSMAX_MEDI APARTMENTS_MEDI 0.614018
2091 FLOORSMAX_MODE APARTMENTS_AVG 0.613594
2950 ENTRANCES_MEDI APARTMENTS_MODE 0.612075
2567 APARTMENTS_MEDI FLOORSMAX_MODE 0.611887
1642 APARTMENTS_MODE ENTRANCES_AVG 0.611844
2552 APARTMENTS_MEDI ENTRANCES_AVG 0.611029
2964 ENTRANCES_MEDI APARTMENTS_MEDI 0.610880
1116 ENTRANCES_AVG APARTMENTS_AVG 0.610671
2936 ENTRANCES_MEDI APARTMENTS_AVG 0.607111
2372 LIVINGAREA_MODE FLOORSMAX_MODE 0.605688
In [21]:
get_corr_matrix(dataset = pd_loan_train[list_var_continuous], 
                metodo='spearman', size_figure=[10,8])
No description has been provided for this image
Out[21]:
0

También hemos hecho la matriz de spearman. Ambas matrices proporcionan resultados muy parecidos, por lo que llegamos a la conclusión de que es necesario realizar una simplificación de las variables para evitar problemas en el modelado, ya que muchas de nuestras variables no nos están aportando un valor añadido y nos hace el trabajo más complejo.

In [22]:
corr = pd_loan_train[list_var_continuous].corr('spearman')
new_corr = corr.abs()
new_corr.loc[:,:] = np.tril(new_corr, k=-1) 
new_corr = new_corr.stack().to_frame('correlation').reset_index().sort_values(by='correlation', ascending=False)
new_corr[new_corr['correlation']>0.6]
Out[22]:
level_0 level_1 correlation
2744 YEARS_BUILD_MEDI YEARS_BUILD_AVG 0.998428
2678 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_AVG 0.997466
3694 OBS_60_CNT_SOCIAL_CIRCLE OBS_30_CNT_SOCIAL_CIRCLE 0.997344
3074 FLOORSMIN_MEDI FLOORSMIN_AVG 0.996217
3206 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.996197
3140 LANDAREA_MEDI LANDAREA_AVG 0.996097
2810 COMMONAREA_MEDI COMMONAREA_AVG 0.995745
3272 LIVINGAREA_MEDI LIVINGAREA_AVG 0.995526
2546 APARTMENTS_MEDI APARTMENTS_AVG 0.995280
3008 FLOORSMAX_MEDI FLOORSMAX_AVG 0.994909
2612 BASEMENTAREA_MEDI BASEMENTAREA_AVG 0.994903
2942 ENTRANCES_MEDI ENTRANCES_AVG 0.993446
2876 ELEVATORS_MEDI ELEVATORS_AVG 0.990981
2758 YEARS_BUILD_MEDI YEARS_BUILD_MODE 0.988085
1834 YEARS_BUILD_MODE YEARS_BUILD_AVG 0.987839
923 YEARS_BUILD_AVG YEARS_BEGINEXPLUATATION_AVG 0.987054
3022 FLOORSMAX_MEDI FLOORSMAX_MODE 0.986370
2771 YEARS_BUILD_MEDI YEARS_BEGINEXPLUATATION_MEDI 0.985982
3088 FLOORSMIN_MEDI FLOORSMIN_MODE 0.985881
2743 YEARS_BUILD_MEDI YEARS_BEGINEXPLUATATION_AVG 0.985650
2692 YEARS_BEGINEXPLUATATION_MEDI YEARS_BEGINEXPLUATATION_MODE 0.985505
1768 YEARS_BEGINEXPLUATATION_MODE YEARS_BEGINEXPLUATATION_AVG 0.985173
196 AMT_GOODS_PRICE AMT_CREDIT 0.984943
2679 YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_AVG 0.984562
2164 FLOORSMIN_MODE FLOORSMIN_AVG 0.982143
3338 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_AVG 0.981957
2098 FLOORSMAX_MODE FLOORSMAX_AVG 0.981609
3404 NONLIVINGAREA_MEDI NONLIVINGAREA_AVG 0.981065
1847 YEARS_BUILD_MODE YEARS_BEGINEXPLUATATION_MODE 0.980198
2890 ELEVATORS_MEDI ELEVATORS_MODE 0.977569
1833 YEARS_BUILD_MODE YEARS_BEGINEXPLUATATION_AVG 0.975319
2693 YEARS_BEGINEXPLUATATION_MEDI YEARS_BUILD_MODE 0.974900
1376 LIVINGAPARTMENTS_AVG APARTMENTS_AVG 0.973931
3220 LIVINGAPARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.973522
3224 LIVINGAPARTMENTS_MEDI APARTMENTS_MEDI 0.971352
3154 LANDAREA_MEDI LANDAREA_MODE 0.970759
3196 LIVINGAPARTMENTS_MEDI APARTMENTS_AVG 0.970180
2824 COMMONAREA_MEDI COMMONAREA_MODE 0.970113
2560 APARTMENTS_MEDI APARTMENTS_MODE 0.970000
2296 LIVINGAPARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.969626
2757 YEARS_BUILD_MEDI YEARS_BEGINEXPLUATATION_MODE 0.969480
1769 YEARS_BEGINEXPLUATATION_MODE YEARS_BUILD_AVG 0.969186
3286 LIVINGAREA_MEDI LIVINGAREA_MODE 0.969180
1966 ELEVATORS_MODE ELEVATORS_AVG 0.968642
2556 APARTMENTS_MEDI LIVINGAPARTMENTS_AVG 0.968240
2230 LANDAREA_MODE LANDAREA_AVG 0.967052
1636 APARTMENTS_MODE APARTMENTS_AVG 0.964885
2956 ENTRANCES_MEDI ENTRANCES_MODE 0.964759
1900 COMMONAREA_MODE COMMONAREA_AVG 0.964737
2362 LIVINGAREA_MODE LIVINGAREA_AVG 0.964263
2626 BASEMENTAREA_MEDI BASEMENTAREA_MODE 0.963546
1702 BASEMENTAREA_MODE BASEMENTAREA_AVG 0.960061
2032 ENTRANCES_MODE ENTRANCES_AVG 0.957524
2300 LIVINGAPARTMENTS_MODE APARTMENTS_MODE 0.956511
3352 NONLIVINGAPARTMENTS_MEDI NONLIVINGAPARTMENTS_MODE 0.950325
2570 APARTMENTS_MEDI LIVINGAPARTMENTS_MODE 0.947595
2286 LIVINGAPARTMENTS_MODE APARTMENTS_AVG 0.944239
3418 NONLIVINGAREA_MEDI NONLIVINGAREA_MODE 0.941355
3467 TOTALAREA_MODE LIVINGAREA_AVG 0.939504
3495 TOTALAREA_MODE LIVINGAREA_MEDI 0.935012
3210 LIVINGAPARTMENTS_MEDI APARTMENTS_MODE 0.933764
2428 NONLIVINGAPARTMENTS_MODE NONLIVINGAPARTMENTS_AVG 0.930926
1646 APARTMENTS_MODE LIVINGAPARTMENTS_AVG 0.929945
2494 NONLIVINGAREA_MODE NONLIVINGAREA_AVG 0.918045
3481 TOTALAREA_MODE LIVINGAREA_MODE 0.917397
1441 LIVINGAREA_AVG APARTMENTS_AVG 0.905822
3289 LIVINGAREA_MEDI APARTMENTS_MEDI 0.904069
3261 LIVINGAREA_MEDI APARTMENTS_AVG 0.901992
2557 APARTMENTS_MEDI LIVINGAREA_AVG 0.901102
3456 TOTALAREA_MODE APARTMENTS_AVG 0.899416
1451 LIVINGAREA_AVG LIVINGAPARTMENTS_AVG 0.896571
3484 TOTALAREA_MODE APARTMENTS_MEDI 0.894394
3299 LIVINGAREA_MEDI LIVINGAPARTMENTS_MEDI 0.893339
2365 LIVINGAREA_MODE APARTMENTS_MODE 0.893206
3207 LIVINGAPARTMENTS_MEDI LIVINGAREA_AVG 0.892882
3271 LIVINGAREA_MEDI LIVINGAPARTMENTS_AVG 0.891525
2571 APARTMENTS_MEDI LIVINGAREA_MODE 0.877484
3466 TOTALAREA_MODE LIVINGAPARTMENTS_AVG 0.875855
3470 TOTALAREA_MODE APARTMENTS_MODE 0.875727
2375 LIVINGAREA_MODE LIVINGAPARTMENTS_MODE 0.875044
3275 LIVINGAREA_MEDI APARTMENTS_MODE 0.874660
2351 LIVINGAREA_MODE APARTMENTS_AVG 0.873569
3494 TOTALAREA_MODE LIVINGAPARTMENTS_MEDI 0.871975
1647 APARTMENTS_MODE LIVINGAREA_AVG 0.870164
3285 LIVINGAREA_MEDI LIVINGAPARTMENTS_MODE 0.869272
2297 LIVINGAPARTMENTS_MODE LIVINGAREA_AVG 0.866421
3221 LIVINGAPARTMENTS_MEDI LIVINGAREA_MODE 0.858586
2361 LIVINGAREA_MODE LIVINGAPARTMENTS_AVG 0.855480
3480 TOTALAREA_MODE LIVINGAPARTMENTS_MODE 0.853377
1186 FLOORSMAX_AVG ELEVATORS_AVG 0.849617
3034 FLOORSMAX_MEDI ELEVATORS_MEDI 0.845770
2878 ELEVATORS_MEDI FLOORSMAX_AVG 0.844668
3760 DEF_60_CNT_SOCIAL_CIRCLE DEF_30_CNT_SOCIAL_CIRCLE 0.844608
3006 FLOORSMAX_MEDI ELEVATORS_AVG 0.842088
2110 FLOORSMAX_MODE ELEVATORS_MODE 0.833181
3020 FLOORSMAX_MEDI ELEVATORS_MODE 0.830540
131 AMT_ANNUITY AMT_CREDIT 0.830277
1968 ELEVATORS_MODE FLOORSMAX_AVG 0.829479
197 AMT_GOODS_PRICE AMT_ANNUITY 0.828031
2892 ELEVATORS_MEDI FLOORSMAX_MODE 0.825961
2096 FLOORSMAX_MODE ELEVATORS_AVG 0.822037
1448 LIVINGAREA_AVG FLOORSMAX_AVG 0.782872
3012 FLOORSMAX_MEDI LIVINGAREA_AVG 0.781536
3296 LIVINGAREA_MEDI FLOORSMAX_MEDI 0.779596
3268 LIVINGAREA_MEDI FLOORSMAX_AVG 0.777663
3463 TOTALAREA_MODE FLOORSMAX_AVG 0.776928
2102 FLOORSMAX_MODE LIVINGAREA_AVG 0.776025
3491 TOTALAREA_MODE FLOORSMAX_MEDI 0.775981
3477 TOTALAREA_MODE FLOORSMAX_MODE 0.774683
3282 LIVINGAREA_MEDI FLOORSMAX_MODE 0.774643
2372 LIVINGAREA_MODE FLOORSMAX_MODE 0.760557
1181 FLOORSMAX_AVG APARTMENTS_AVG 0.756137
3001 FLOORSMAX_MEDI APARTMENTS_AVG 0.754862
3029 FLOORSMAX_MEDI APARTMENTS_MEDI 0.752127
2091 FLOORSMAX_MODE APARTMENTS_AVG 0.750924
2553 APARTMENTS_MEDI FLOORSMAX_AVG 0.750381
2567 APARTMENTS_MEDI FLOORSMAX_MODE 0.748658
3026 FLOORSMAX_MEDI LIVINGAREA_MODE 0.747296
2358 LIVINGAREA_MODE FLOORSMAX_AVG 0.744226
2105 FLOORSMAX_MODE APARTMENTS_MODE 0.734853
1446 LIVINGAREA_AVG ELEVATORS_AVG 0.728249
1383 LIVINGAPARTMENTS_AVG FLOORSMAX_AVG 0.725546
2882 ELEVATORS_MEDI LIVINGAREA_AVG 0.723934
3011 FLOORSMAX_MEDI LIVINGAPARTMENTS_AVG 0.723254
3231 LIVINGAPARTMENTS_MEDI FLOORSMAX_MEDI 0.721116
3294 LIVINGAREA_MEDI ELEVATORS_MEDI 0.720893
3203 LIVINGAPARTMENTS_MEDI FLOORSMAX_AVG 0.720772
1442 LIVINGAREA_AVG BASEMENTAREA_AVG 0.720061
3015 FLOORSMAX_MEDI APARTMENTS_MODE 0.719711
3266 LIVINGAREA_MEDI ELEVATORS_AVG 0.719150
2101 FLOORSMAX_MODE LIVINGAPARTMENTS_AVG 0.717936
1643 APARTMENTS_MODE FLOORSMAX_AVG 0.717124
3262 LIVINGAREA_MEDI BASEMENTAREA_AVG 0.717041
3461 TOTALAREA_MODE ELEVATORS_AVG 0.716815
3217 LIVINGAPARTMENTS_MEDI FLOORSMAX_MODE 0.716248
3290 LIVINGAREA_MEDI BASEMENTAREA_MEDI 0.715917
2622 BASEMENTAREA_MEDI LIVINGAREA_AVG 0.714122
3489 TOTALAREA_MODE ELEVATORS_MEDI 0.714046
3457 TOTALAREA_MODE BASEMENTAREA_AVG 0.712283
1972 ELEVATORS_MODE LIVINGAREA_AVG 0.709392
3280 LIVINGAREA_MEDI ELEVATORS_MODE 0.708131
3485 TOTALAREA_MODE BASEMENTAREA_MEDI 0.706017
3475 TOTALAREA_MODE ELEVATORS_MODE 0.705510
2366 LIVINGAREA_MODE BASEMENTAREA_MODE 0.703925
2307 LIVINGAPARTMENTS_MODE FLOORSMAX_MODE 0.702851
2507 NONLIVINGAREA_MODE NONLIVINGAPARTMENTS_MODE 0.699992
2636 BASEMENTAREA_MEDI LIVINGAREA_MODE 0.699436
791 BASEMENTAREA_AVG APARTMENTS_AVG 0.699264
2352 LIVINGAREA_MODE BASEMENTAREA_AVG 0.698285
2547 APARTMENTS_MEDI BASEMENTAREA_AVG 0.696681
1051 ELEVATORS_AVG APARTMENTS_AVG 0.696164
2639 BASEMENTAREA_MEDI APARTMENTS_MEDI 0.696086
2611 BASEMENTAREA_MEDI APARTMENTS_AVG 0.693750
1377 LIVINGAPARTMENTS_AVG BASEMENTAREA_AVG 0.693211
3025 FLOORSMAX_MEDI LIVINGAPARTMENTS_MODE 0.692947
2871 ELEVATORS_MEDI APARTMENTS_AVG 0.692913
1381 LIVINGAPARTMENTS_AVG ELEVATORS_AVG 0.692051
3197 LIVINGAPARTMENTS_MEDI BASEMENTAREA_AVG 0.691784
2293 LIVINGAPARTMENTS_MODE FLOORSMAX_AVG 0.690693
3431 NONLIVINGAREA_MEDI NONLIVINGAPARTMENTS_MEDI 0.690464
3225 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MEDI 0.690001
2899 ELEVATORS_MEDI APARTMENTS_MEDI 0.689852
2370 LIVINGAREA_MODE ELEVATORS_MODE 0.688240
2881 ELEVATORS_MEDI LIVINGAPARTMENTS_AVG 0.688170
1583 NONLIVINGAREA_AVG NONLIVINGAPARTMENTS_AVG 0.687643
2621 BASEMENTAREA_MEDI LIVINGAPARTMENTS_AVG 0.687392
2551 APARTMENTS_MEDI ELEVATORS_AVG 0.687294
3229 LIVINGAPARTMENTS_MEDI ELEVATORS_MEDI 0.686716
3201 LIVINGAPARTMENTS_MEDI ELEVATORS_AVG 0.685277
1715 BASEMENTAREA_MODE APARTMENTS_MODE 0.683802
1961 ELEVATORS_MODE APARTMENTS_AVG 0.680626
3403 NONLIVINGAREA_MEDI NONLIVINGAPARTMENTS_AVG 0.679343
2565 APARTMENTS_MEDI ELEVATORS_MODE 0.679235
2625 BASEMENTAREA_MEDI APARTMENTS_MODE 0.678549
2287 LIVINGAPARTMENTS_MODE BASEMENTAREA_AVG 0.678028
2635 BASEMENTAREA_MEDI LIVINGAPARTMENTS_MODE 0.677939
1637 APARTMENTS_MODE BASEMENTAREA_AVG 0.677427
3276 LIVINGAREA_MEDI BASEMENTAREA_MODE 0.677232
1971 ELEVATORS_MODE LIVINGAPARTMENTS_AVG 0.674387
2896 ELEVATORS_MEDI LIVINGAREA_MODE 0.674219
3339 NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_AVG 0.673805
3215 LIVINGAPARTMENTS_MEDI ELEVATORS_MODE 0.673757
2301 LIVINGAPARTMENTS_MODE BASEMENTAREA_MODE 0.673746
3471 TOTALAREA_MODE BASEMENTAREA_MODE 0.673084
1712 BASEMENTAREA_MODE LIVINGAREA_AVG 0.671991
2356 LIVINGAREA_MODE ELEVATORS_AVG 0.667969
1253 FLOORSMIN_AVG FLOORSMAX_AVG 0.667848
3009 FLOORSMAX_MEDI FLOORSMIN_AVG 0.664254
3101 FLOORSMIN_MEDI FLOORSMAX_MEDI 0.663769
3073 FLOORSMIN_MEDI FLOORSMAX_AVG 0.662099
1975 ELEVATORS_MODE APARTMENTS_MODE 0.661488
3353 NONLIVINGAPARTMENTS_MEDI NONLIVINGAREA_MODE 0.661417
2561 APARTMENTS_MEDI BASEMENTAREA_MODE 0.660128
1117 ENTRANCES_AVG BASEMENTAREA_AVG 0.659801
2305 LIVINGAPARTMENTS_MODE ELEVATORS_MODE 0.659662
2617 BASEMENTAREA_MEDI ENTRANCES_AVG 0.659516
3417 NONLIVINGAREA_MEDI NONLIVINGAPARTMENTS_MODE 0.658267
2965 ENTRANCES_MEDI BASEMENTAREA_MEDI 0.658100
1701 BASEMENTAREA_MODE APARTMENTS_AVG 0.654398
2099 FLOORSMAX_MODE FLOORSMIN_AVG 0.653345
3087 FLOORSMIN_MEDI FLOORSMAX_MODE 0.653094
2937 ENTRANCES_MEDI BASEMENTAREA_AVG 0.652333
2895 ELEVATORS_MEDI LIVINGAPARTMENTS_MODE 0.649293
3211 LIVINGAPARTMENTS_MEDI BASEMENTAREA_MODE 0.648081
2041 ENTRANCES_MODE BASEMENTAREA_MODE 0.645885
2177 FLOORSMIN_MODE FLOORSMAX_MODE 0.645556
2291 LIVINGAPARTMENTS_MODE ELEVATORS_AVG 0.644567
2493 NONLIVINGAREA_MODE NONLIVINGAPARTMENTS_AVG 0.644226
2885 ELEVATORS_MEDI APARTMENTS_MODE 0.643921
1711 BASEMENTAREA_MODE LIVINGAPARTMENTS_AVG 0.643555
2951 ENTRANCES_MEDI BASEMENTAREA_MODE 0.641848
1707 BASEMENTAREA_MODE ENTRANCES_AVG 0.641251
3023 FLOORSMAX_MEDI FLOORSMIN_MODE 0.640916
1641 APARTMENTS_MODE ELEVATORS_AVG 0.638622
2163 FLOORSMIN_MODE FLOORSMAX_AVG 0.638591
2429 NONLIVINGAPARTMENTS_MODE NONLIVINGAREA_AVG 0.637092
3460 TOTALAREA_MODE COMMONAREA_AVG 0.623052
2631 BASEMENTAREA_MEDI ENTRANCES_MODE 0.621345
3488 TOTALAREA_MODE COMMONAREA_MEDI 0.618942
2027 ENTRANCES_MODE BASEMENTAREA_AVG 0.614406
1447 LIVINGAREA_AVG ENTRANCES_AVG 0.608158
3267 LIVINGAREA_MEDI ENTRANCES_AVG 0.607489
3295 LIVINGAREA_MEDI ENTRANCES_MEDI 0.604612
2357 LIVINGAREA_MODE ENTRANCES_AVG 0.601575
2961 ENTRANCES_MEDI LIVINGAREA_MODE 0.600879

TRATAMIENTO DE LOS VALORES NULOS

In [23]:
def get_percent_null_values_target(pd_loan, list_var_continuous, target):

    pd_final = pd.DataFrame()
    for i in list_var_continuous:
        if pd_loan[i].isnull().sum() > 0:
            pd_concat_percent = pd.DataFrame(pd_loan[target][pd_loan[i].isnull()]
                                             .value_counts(normalize=True).reset_index()).T

            # Verifica cuántas columnas tiene pd_concat_percent antes de asignarlas
            if pd_concat_percent.shape[1] >= 2:
                pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0],
                                             pd_concat_percent.iloc[0, 1]]
            elif pd_concat_percent.shape[1] == 1:
                pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0]]

            pd_concat_percent = pd_concat_percent.drop(target, axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_null_values'] = pd_loan[i].isnull().sum()
            pd_concat_percent['porcentaje_sum_null_values'] = pd_loan[i].isnull().sum() / pd_loan.shape[0]
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)

    if pd_final.empty:
        print('No existen variables con valores nulos')

    return pd_final
In [24]:
get_percent_null_values_target(pd_loan_train, list_var_continuous, target='TARGET')
Out[24]:
0.0 variable sum_null_values porcentaje_sum_null_values 1.0
0 1.000000 AMT_ANNUITY 6 0.000024 NaN
1 0.934498 AMT_GOODS_PRICE 229 0.000931 0.065502
2 0.915038 OWN_CAR_AGE 162531 0.660674 0.084962
3 1.000000 CNT_FAM_MEMBERS 2 0.000008 NaN
4 0.914876 EXT_SOURCE_1 138763 0.564059 0.085124
5 0.922652 EXT_SOURCE_2 543 0.002207 0.077348
6 0.906754 EXT_SOURCE_3 48828 0.198481 0.093246
7 0.908370 APARTMENTS_AVG 124643 0.506662 0.091630
8 0.910746 BASEMENTAREA_AVG 143736 0.584274 0.089254
9 0.907886 YEARS_BEGINEXPLUATATION_AVG 119787 0.486923 0.092114
10 0.913042 YEARS_BUILD_AVG 163539 0.664771 0.086958
11 0.914184 COMMONAREA_AVG 171857 0.698583 0.085816
12 0.908915 ELEVATORS_AVG 130922 0.532186 0.091085
13 0.908148 ENTRANCES_AVG 123633 0.502557 0.091852
14 0.908045 FLOORSMAX_AVG 122179 0.496646 0.091955
15 0.913584 FLOORSMIN_AVG 166856 0.678254 0.086416
16 0.911689 LANDAREA_AVG 145825 0.592765 0.088311
17 0.913681 LIVINGAPARTMENTS_AVG 168063 0.683161 0.086319
18 0.908475 LIVINGAREA_AVG 123256 0.501024 0.091525
19 0.914042 NONLIVINGAPARTMENTS_AVG 170757 0.694112 0.085958
20 0.909545 NONLIVINGAREA_AVG 135493 0.550767 0.090455
21 0.908370 APARTMENTS_MODE 124643 0.506662 0.091630
22 0.910746 BASEMENTAREA_MODE 143736 0.584274 0.089254
23 0.907886 YEARS_BEGINEXPLUATATION_MODE 119787 0.486923 0.092114
24 0.913042 YEARS_BUILD_MODE 163539 0.664771 0.086958
25 0.914184 COMMONAREA_MODE 171857 0.698583 0.085816
26 0.908915 ELEVATORS_MODE 130922 0.532186 0.091085
27 0.908148 ENTRANCES_MODE 123633 0.502557 0.091852
28 0.908045 FLOORSMAX_MODE 122179 0.496646 0.091955
29 0.913584 FLOORSMIN_MODE 166856 0.678254 0.086416
30 0.911689 LANDAREA_MODE 145825 0.592765 0.088311
31 0.913681 LIVINGAPARTMENTS_MODE 168063 0.683161 0.086319
32 0.908475 LIVINGAREA_MODE 123256 0.501024 0.091525
33 0.914042 NONLIVINGAPARTMENTS_MODE 170757 0.694112 0.085958
34 0.909545 NONLIVINGAREA_MODE 135493 0.550767 0.090455
35 0.908370 APARTMENTS_MEDI 124643 0.506662 0.091630
36 0.910746 BASEMENTAREA_MEDI 143736 0.584274 0.089254
37 0.907886 YEARS_BEGINEXPLUATATION_MEDI 119787 0.486923 0.092114
38 0.913042 YEARS_BUILD_MEDI 163539 0.664771 0.086958
39 0.914184 COMMONAREA_MEDI 171857 0.698583 0.085816
40 0.908915 ELEVATORS_MEDI 130922 0.532186 0.091085
41 0.908148 ENTRANCES_MEDI 123633 0.502557 0.091852
42 0.908045 FLOORSMAX_MEDI 122179 0.496646 0.091955
43 0.913584 FLOORSMIN_MEDI 166856 0.678254 0.086416
44 0.911689 LANDAREA_MEDI 145825 0.592765 0.088311
45 0.913681 LIVINGAPARTMENTS_MEDI 168063 0.683161 0.086319
46 0.908475 LIVINGAREA_MEDI 123256 0.501024 0.091525
47 0.914042 NONLIVINGAPARTMENTS_MEDI 170757 0.694112 0.085958
48 0.909545 NONLIVINGAREA_MEDI 135493 0.550767 0.090455
49 0.907624 TOTALAREA_MODE 118526 0.481797 0.092376
50 0.963592 OBS_30_CNT_SOCIAL_CIRCLE 824 0.003349 0.036408
51 0.963592 DEF_30_CNT_SOCIAL_CIRCLE 824 0.003349 0.036408
52 0.963592 OBS_60_CNT_SOCIAL_CIRCLE 824 0.003349 0.036408
53 0.963592 DEF_60_CNT_SOCIAL_CIRCLE 824 0.003349 0.036408
54 0.896084 AMT_REQ_CREDIT_BUREAU_HOUR 33248 0.135150 0.103916
55 0.896084 AMT_REQ_CREDIT_BUREAU_DAY 33248 0.135150 0.103916
56 0.896084 AMT_REQ_CREDIT_BUREAU_WEEK 33248 0.135150 0.103916
57 0.896084 AMT_REQ_CREDIT_BUREAU_MON 33248 0.135150 0.103916
58 0.896084 AMT_REQ_CREDIT_BUREAU_QRT 33248 0.135150 0.103916
59 0.896084 AMT_REQ_CREDIT_BUREAU_YEAR 33248 0.135150 0.103916

Hemos decidido que nuestro umbral de valores nulos se situará en el 50%, es decir, sólo usaremos aquellas variables que cuenten con menos de un 50% de valores nulos. Haciendo este filtrado, creamos un nuevo df llamado pd_loan_train_filtered.

In [25]:
threshold = 0.5
list_vars_not_null = list(pd_null_columnas[pd_null_columnas['porcentaje_columnas'] < threshold].index)
pd_loan_train_filtered = pd_loan_train.loc[:, list_vars_not_null]

print(pd_loan_train_filtered)
            FLOORSMAX_MEDI  FLOORSMAX_AVG  FLOORSMAX_MODE  \
SK_ID_CURR                                                  
429844                 NaN            NaN             NaN   
259562                 NaN            NaN             NaN   
188975                 NaN            NaN             NaN   
254199              0.7083         0.6917          0.6250   
332686              0.3333         0.3333          0.3333   
...                    ...            ...             ...   
191213              0.3333         0.3333          0.3333   
383329                 NaN            NaN             NaN   
177037                 NaN            NaN             NaN   
358151              0.4583         0.4583          0.4583   
277125              0.3333         0.3333          0.3333   

            YEARS_BEGINEXPLUATATION_MEDI  YEARS_BEGINEXPLUATATION_MODE  \
SK_ID_CURR                                                               
429844                               NaN                           NaN   
259562                               NaN                           NaN   
188975                               NaN                           NaN   
254199                            0.9990                        0.9985   
332686                            0.9861                        0.9861   
...                                  ...                           ...   
191213                            0.9891                        0.9891   
383329                               NaN                           NaN   
177037                               NaN                           NaN   
358151                            0.9896                        0.9896   
277125                            0.9861                        0.9861   

            YEARS_BEGINEXPLUATATION_AVG  TOTALAREA_MODE EMERGENCYSTATE_MODE  \
SK_ID_CURR                                                                    
429844                              NaN             NaN                 NaN   
259562                              NaN             NaN                 NaN   
188975                              NaN             NaN                 NaN   
254199                           0.9985          0.1849                  No   
332686                           0.9861          0.2577                  No   
...                                 ...             ...                 ...   
191213                           0.9891          0.0988                  No   
383329                              NaN             NaN                 NaN   
177037                              NaN             NaN                 NaN   
358151                           0.9896          0.0880                  No   
277125                           0.9861          0.1811                  No   

                  OCCUPATION_TYPE  EXT_SOURCE_3  AMT_REQ_CREDIT_BUREAU_MON  \
SK_ID_CURR                                                                   
429844                   Laborers      0.665855                        0.0   
259562                Sales staff      0.808394                        0.0   
188975                        NaN           NaN                        0.0   
254199      High skill tech staff      0.519097                        1.0   
332686                 Core staff      0.537070                        1.0   
...                           ...           ...                        ...   
191213                   Laborers      0.837747                        0.0   
383329                   Managers      0.358951                        1.0   
177037                        NaN      0.843544                        0.0   
358151              Cooking staff      0.626304                        0.0   
277125             Medicine staff      0.360613                        0.0   

            AMT_REQ_CREDIT_BUREAU_HOUR  AMT_REQ_CREDIT_BUREAU_YEAR  \
SK_ID_CURR                                                           
429844                             0.0                         1.0   
259562                             0.0                         1.0   
188975                             0.0                         1.0   
254199                             0.0                         0.0   
332686                             0.0                         0.0   
...                                ...                         ...   
191213                             0.0                         3.0   
383329                             0.0                         3.0   
177037                             0.0                         2.0   
358151                             0.0                         1.0   
277125                             0.0                         5.0   

            AMT_REQ_CREDIT_BUREAU_QRT  AMT_REQ_CREDIT_BUREAU_DAY  \
SK_ID_CURR                                                         
429844                            1.0                        0.0   
259562                            0.0                        0.0   
188975                            0.0                        0.0   
254199                            0.0                        0.0   
332686                            0.0                        0.0   
...                               ...                        ...   
191213                            0.0                        0.0   
383329                            1.0                        0.0   
177037                            0.0                        0.0   
358151                            0.0                        0.0   
277125                            0.0                        0.0   

            AMT_REQ_CREDIT_BUREAU_WEEK  NAME_TYPE_SUITE  \
SK_ID_CURR                                                
429844                             0.0    Unaccompanied   
259562                             0.0    Unaccompanied   
188975                             0.0  Spouse, partner   
254199                             0.0    Unaccompanied   
332686                             0.0    Unaccompanied   
...                                ...              ...   
191213                             0.0    Unaccompanied   
383329                             0.0    Unaccompanied   
177037                             0.0    Unaccompanied   
358151                             0.0    Unaccompanied   
277125                             0.0    Unaccompanied   

            OBS_30_CNT_SOCIAL_CIRCLE  DEF_30_CNT_SOCIAL_CIRCLE  \
SK_ID_CURR                                                       
429844                          10.0                       0.0   
259562                           1.0                       0.0   
188975                           4.0                       0.0   
254199                           2.0                       0.0   
332686                           0.0                       0.0   
...                              ...                       ...   
191213                           0.0                       0.0   
383329                           0.0                       0.0   
177037                           3.0                       1.0   
358151                           1.0                       0.0   
277125                           0.0                       0.0   

            OBS_60_CNT_SOCIAL_CIRCLE  DEF_60_CNT_SOCIAL_CIRCLE  EXT_SOURCE_2  \
SK_ID_CURR                                                                     
429844                          10.0                       0.0      0.671561   
259562                           1.0                       0.0      0.684803   
188975                           4.0                       0.0      0.163187   
254199                           2.0                       0.0      0.630240   
332686                           0.0                       0.0      0.565010   
...                              ...                       ...           ...   
191213                           0.0                       0.0      0.685879   
383329                           0.0                       0.0      0.527424   
177037                           3.0                       0.0      0.638523   
358151                           1.0                       0.0      0.726744   
277125                           0.0                       0.0      0.638629   

            AMT_GOODS_PRICE  AMT_ANNUITY  CNT_FAM_MEMBERS  FLAG_DOCUMENT_4  \
SK_ID_CURR                                                                   
429844             450000.0      22500.0              2.0                0   
259562             450000.0      26613.0              1.0                0   
188975             454500.0      21672.0              3.0                0   
254199             283500.0      12042.0              2.0                0   
332686             679500.0      23931.0              4.0                0   
...                     ...          ...              ...              ...   
191213             585000.0      29250.0              3.0                0   
383329             900000.0      94509.0              3.0                0   
177037             225000.0      26226.0              2.0                0   
358151             225000.0      16852.5              2.0                0   
277125             225000.0      14242.5              1.0                0   

            FLAG_DOCUMENT_7  FLAG_DOCUMENT_6  FLAG_DOCUMENT_5  \
SK_ID_CURR                                                      
429844                    0                0                0   
259562                    0                0                0   
188975                    0                0                0   
254199                    0                0                0   
332686                    0                0                0   
...                     ...              ...              ...   
191213                    0                0                0   
383329                    0                0                0   
177037                    0                0                0   
358151                    0                0                0   
277125                    0                0                0   

            FLAG_DOCUMENT_8  FLAG_DOCUMENT_12  DAYS_LAST_PHONE_CHANGE  \
SK_ID_CURR                                                              
429844                    0                 0                 -1623.0   
259562                    0                 0                 -2948.0   
188975                    0                 0                 -2064.0   
254199                    0                 0                    -1.0   
332686                    0                 0                  -602.0   
...                     ...               ...                     ...   
191213                    0                 0                 -2145.0   
383329                    0                 0                 -2879.0   
177037                    0                 0                  -686.0   
358151                    0                 0                 -1658.0   
277125                    0                 0                 -1807.0   

            FLAG_DOCUMENT_3  FLAG_DOCUMENT_2  FLAG_DOCUMENT_11  \
SK_ID_CURR                                                       
429844                    0                0                 0   
259562                    1                0                 0   
188975                    1                0                 0   
254199                    1                0                 0   
332686                    1                0                 0   
...                     ...              ...               ...   
191213                    0                0                 0   
383329                    1                0                 0   
177037                    1                0                 0   
358151                    1                0                 0   
277125                    1                0                 0   

            FLAG_DOCUMENT_21  FLAG_DOCUMENT_20  FLAG_DOCUMENT_19  \
SK_ID_CURR                                                         
429844                     0                 0                 0   
259562                     0                 0                 0   
188975                     0                 0                 0   
254199                     0                 0                 0   
332686                     0                 0                 0   
...                      ...               ...               ...   
191213                     0                 0                 0   
383329                     0                 0                 0   
177037                     0                 0                 0   
358151                     0                 0                 0   
277125                     0                 0                 0   

            FLAG_DOCUMENT_18  FLAG_DOCUMENT_17  FLAG_DOCUMENT_9  \
SK_ID_CURR                                                        
429844                     0                 0                0   
259562                     0                 0                0   
188975                     0                 0                0   
254199                     0                 0                0   
332686                     0                 0                0   
...                      ...               ...              ...   
191213                     0                 0                0   
383329                     0                 0                0   
177037                     0                 0                0   
358151                     0                 0                0   
277125                     0                 0                0   

            FLAG_DOCUMENT_16  FLAG_DOCUMENT_15  FLAG_DOCUMENT_14  \
SK_ID_CURR                                                         
429844                     0                 0                 0   
259562                     0                 0                 0   
188975                     0                 0                 0   
254199                     1                 0                 0   
332686                     0                 0                 0   
...                      ...               ...               ...   
191213                     0                 0                 0   
383329                     0                 0                 0   
177037                     0                 0                 0   
358151                     0                 0                 0   
277125                     0                 0                 0   

            FLAG_DOCUMENT_13  FLAG_DOCUMENT_10 NAME_CONTRACT_TYPE CODE_GENDER  \
SK_ID_CURR                                                                      
429844                     0                 0    Revolving loans           M   
259562                     0                 0         Cash loans           F   
188975                     0                 0         Cash loans           M   
254199                     0                 0         Cash loans           F   
332686                     0                 0         Cash loans           F   
...                      ...               ...                ...         ...   
191213                     0                 0    Revolving loans           F   
383329                     0                 0         Cash loans           M   
177037                     0                 0         Cash loans           F   
358151                     0                 0         Cash loans           F   
277125                     0                 0         Cash loans           F   

            FLAG_MOBIL FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
SK_ID_CURR                                                          
429844               1            N               Y             0   
259562               1            Y               N             0   
188975               1            N               Y             1   
254199               1            N               N             0   
332686               1            N               N             2   
...                ...          ...             ...           ...   
191213               1            Y               Y             1   
383329               1            Y               Y             1   
177037               1            N               N             0   
358151               1            N               Y             0   
277125               1            N               Y             0   

            AMT_INCOME_TOTAL  AMT_CREDIT      NAME_INCOME_TYPE  \
SK_ID_CURR                                                       
429844              247500.0    450000.0  Commercial associate   
259562               90000.0    521280.0               Working   
188975               81000.0    508495.5             Pensioner   
254199              144000.0    433188.0  Commercial associate   
332686              225000.0    814041.0               Working   
...                      ...         ...                   ...   
191213              202500.0    585000.0  Commercial associate   
383329              225000.0    900000.0               Working   
177037              135000.0    254700.0             Pensioner   
358151              157500.0    343800.0               Working   
277125               76500.0    269550.0               Working   

                      NAME_EDUCATION_TYPE    NAME_FAMILY_STATUS  \
SK_ID_CURR                                                        
429844      Secondary / secondary special               Married   
259562      Secondary / secondary special  Single / not married   
188975                    Lower secondary               Married   
254199      Secondary / secondary special               Married   
332686                   Higher education               Married   
...                                   ...                   ...   
191213      Secondary / secondary special        Civil marriage   
383329      Secondary / secondary special               Married   
177037                   Higher education               Married   
358151      Secondary / secondary special        Civil marriage   
277125      Secondary / secondary special                 Widow   

            NAME_HOUSING_TYPE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  \
SK_ID_CURR                                                              
429844      House / apartment                    0.046220      -19807   
259562      House / apartment                    0.018634      -16262   
188975      House / apartment                    0.031329      -11388   
254199      House / apartment                    0.035792      -19922   
332686      House / apartment                    0.025164      -14371   
...                       ...                         ...         ...   
191213      House / apartment                    0.028663      -20227   
383329      House / apartment                    0.025164      -17586   
177037      House / apartment                    0.030755      -22541   
358151      House / apartment                    0.018850       -9328   
277125      House / apartment                    0.018634      -14036   

            DAYS_EMPLOYED  DAYS_REGISTRATION  DAYS_ID_PUBLISH  FLAG_EMP_PHONE  \
SK_ID_CURR                                                                      
429844               -111            -2222.0            -3368               1   
259562              -4284            -5854.0            -4476               1   
188975             365243            -4881.0            -3940               0   
254199              -9018             -293.0            -3123               1   
332686               -785            -1581.0            -3679               1   
...                   ...                ...              ...             ...   
191213              -1973            -4085.0            -2791               1   
383329              -2545            -3463.0            -1147               1   
177037             365243            -7885.0            -3800               0   
358151               -959            -7701.0            -2022               1   
277125              -5056             -529.0            -4577               1   

                 ORGANIZATION_TYPE  FLAG_WORK_PHONE  FLAG_CONT_MOBILE  \
SK_ID_CURR                                                              
429844      Business Entity Type 2                0                 1   
259562               Self-employed                1                 1   
188975                         XNA                0                 1   
254199      Business Entity Type 2                0                 1   
332686                     Realtor                1                 1   
...                            ...              ...               ...   
191213      Business Entity Type 3                0                 1   
383329               Self-employed                0                 1   
177037                         XNA                0                 1   
358151      Business Entity Type 3                0                 1   
277125                  Government                1                 1   

            FLAG_PHONE  FLAG_EMAIL  REGION_RATING_CLIENT  \
SK_ID_CURR                                                 
429844               0           0                     1   
259562               0           0                     2   
188975               0           0                     2   
254199               1           0                     2   
332686               0           0                     2   
...                ...         ...                   ...   
191213               0           0                     2   
383329               0           0                     2   
177037               0           0                     2   
358151               0           0                     2   
277125               1           0                     2   

            REGION_RATING_CLIENT_W_CITY WEEKDAY_APPR_PROCESS_START  \
SK_ID_CURR                                                           
429844                                1                     SUNDAY   
259562                                2                    TUESDAY   
188975                                2                   THURSDAY   
254199                                2                    TUESDAY   
332686                                2                    TUESDAY   
...                                 ...                        ...   
191213                                2                    TUESDAY   
383329                                2                     MONDAY   
177037                                2                   THURSDAY   
358151                                2                  WEDNESDAY   
277125                                2                    TUESDAY   

            HOUR_APPR_PROCESS_START  REG_REGION_NOT_LIVE_REGION  \
SK_ID_CURR                                                        
429844                           16                           0   
259562                           16                           0   
188975                           15                           0   
254199                           19                           0   
332686                           10                           0   
...                             ...                         ...   
191213                           16                           0   
383329                           12                           0   
177037                           11                           0   
358151                           13                           0   
277125                           12                           0   

            REG_REGION_NOT_WORK_REGION  LIVE_REGION_NOT_WORK_REGION  \
SK_ID_CURR                                                            
429844                               0                            0   
259562                               0                            0   
188975                               0                            0   
254199                               0                            0   
332686                               0                            0   
...                                ...                          ...   
191213                               0                            0   
383329                               0                            0   
177037                               0                            0   
358151                               0                            0   
277125                               0                            0   

            REG_CITY_NOT_LIVE_CITY  REG_CITY_NOT_WORK_CITY  \
SK_ID_CURR                                                   
429844                           0                       1   
259562                           0                       0   
188975                           0                       0   
254199                           0                       0   
332686                           0                       0   
...                            ...                     ...   
191213                           0                       0   
383329                           0                       0   
177037                           0                       0   
358151                           0                       0   
277125                           0                       0   

            LIVE_CITY_NOT_WORK_CITY  TARGET  
SK_ID_CURR                                   
429844                            1       0  
259562                            0       0  
188975                            0       0  
254199                            0       0  
332686                            0       0  
...                             ...     ...  
191213                            0       0  
383329                            0       0  
177037                            0       0  
358151                            0       0  
277125                            0       0  

[246008 rows x 80 columns]
In [26]:
pd_loan_train['AMT_GOODS_PRICE'].fillna(pd_loan_train['AMT_GOODS_PRICE'].median(), inplace=True)
pd_loan_train['AMT_ANNUITY'].fillna(pd_loan_train['AMT_ANNUITY'].median(), inplace=True)
C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\3300765175.py:1: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



C:\Users\Guille\AppData\Local\Temp\ipykernel_19736\3300765175.py:2: FutureWarning:

A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



TRATAMIENTO VARIABLES CATEGORICAS

In [37]:
list_var_cat
Out[37]:
['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

OPCIÓN 2

In [38]:
pd_loan[list_var_continuous] = pd_loan[list_var_continuous].apply(lambda x: x.fillna(x.median()))
In [39]:
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["TARGET"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
TARGET       0      1
TARGET               
0       226148      0
1            0  19860
Out[39]:
0.9999726127135284
In [40]:
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["TARGET"])
cramers_v(confusion_matrix.values)
Out[40]:
0.9999726127135284
In [41]:
confusion_matrix = pd.crosstab(pd_loan_train["TARGET"], pd_loan_train["OWN_CAR_AGE"])
cramers_v(confusion_matrix.values)
Out[41]:
0.06001203138415476
In [42]:
pd_loan_train[list_var_cat] = pd_loan_train[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")
pd_loan_test[list_var_cat] = pd_loan_test[list_var_cat].astype("object").fillna("SIN VALOR").astype("category")

print(pd_loan_test)
           NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  \
SK_ID_CURR                                                               
112958             Cash loans           F            N               N   
238225             Cash loans           F            N               Y   
269335             Cash loans           F            N               Y   
450836             Cash loans           M            Y               Y   
340566        Revolving loans           F            Y               Y   
...                       ...         ...          ...             ...   
235730             Cash loans           F            N               N   
367911             Cash loans           M            Y               N   
302585             Cash loans           M            Y               N   
102961             Cash loans           F            N               Y   
125815             Cash loans           M            N               Y   

            CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
SK_ID_CURR                                                            
112958                 0           92592.0    521280.0      19984.5   
238225                 1           81000.0    238500.0      13819.5   
269335                 0           81000.0    276277.5      13419.0   
450836                 0          112500.0   1058197.5      38137.5   
340566                 0           90000.0    495000.0      24750.0   
...                  ...               ...         ...          ...   
235730                 1          270000.0   1711368.0      70758.0   
367911                 0          112500.0    571446.0      16506.0   
302585                 0          171000.0    706410.0      67203.0   
102961                 0           72000.0   1129500.0      31059.0   
125815                 0          121500.0    630000.0      32296.5   

            AMT_GOODS_PRICE NAME_TYPE_SUITE      NAME_INCOME_TYPE  \
SK_ID_CURR                                                          
112958             450000.0   Unaccompanied  Commercial associate   
238225             238500.0   Unaccompanied  Commercial associate   
269335             238500.0   Unaccompanied             Pensioner   
450836             913500.0          Family             Pensioner   
340566             495000.0   Unaccompanied               Working   
...                     ...             ...                   ...   
235730            1597500.0   Unaccompanied  Commercial associate   
367911             477000.0   Unaccompanied               Working   
302585             679500.0   Unaccompanied  Commercial associate   
102961            1129500.0   Unaccompanied             Pensioner   
125815             630000.0   Unaccompanied               Working   

                      NAME_EDUCATION_TYPE    NAME_FAMILY_STATUS  \
SK_ID_CURR                                                        
112958                   Higher education               Married   
238225      Secondary / secondary special               Married   
269335      Secondary / secondary special  Single / not married   
450836                   Higher education               Married   
340566      Secondary / secondary special               Married   
...                                   ...                   ...   
235730      Secondary / secondary special               Married   
367911      Secondary / secondary special               Married   
302585                   Higher education             Separated   
102961                   Higher education               Married   
125815                  Incomplete higher  Single / not married   

            NAME_HOUSING_TYPE  REGION_POPULATION_RELATIVE  DAYS_BIRTH  \
SK_ID_CURR                                                              
112958      House / apartment                    0.035792      -19647   
238225      House / apartment                    0.018209      -14789   
269335      House / apartment                    0.046220      -21913   
450836      House / apartment                    0.035792      -15868   
340566      House / apartment                    0.018850      -13855   
...                       ...                         ...         ...   
235730      House / apartment                    0.072508      -10263   
367911      House / apartment                    0.022625      -19451   
302585       Office apartment                    0.019101      -16577   
102961      House / apartment                    0.026392      -20665   
125815      House / apartment                    0.018029       -8156   

            DAYS_EMPLOYED  DAYS_REGISTRATION  DAYS_ID_PUBLISH  OWN_CAR_AGE  \
SK_ID_CURR                                                                   
112958              -1240            -7902.0            -3208          NaN   
238225              -7774            -1392.0             -135          NaN   
269335             365243            -8461.0            -4290          NaN   
450836             365243            -9291.0            -4629         19.0   
340566              -2816            -6911.0            -4996          6.0   
...                   ...                ...              ...          ...   
235730              -2904            -5059.0            -1356          NaN   
367911               -296            -5586.0            -2992          1.0   
302585              -2067              -70.0              -76          1.0   
102961             365243            -4180.0            -4224          NaN   
125815              -1494            -3034.0             -844          NaN   

            FLAG_MOBIL  FLAG_EMP_PHONE  FLAG_WORK_PHONE  FLAG_CONT_MOBILE  \
SK_ID_CURR                                                                  
112958               1               1                1                 1   
238225               1               1                0                 1   
269335               1               0                0                 1   
450836               1               0                0                 1   
340566               1               1                1                 1   
...                ...             ...              ...               ...   
235730               1               1                0                 1   
367911               1               1                0                 1   
302585               1               1                0                 1   
102961               1               0                0                 1   
125815               1               1                0                 1   

            FLAG_PHONE  FLAG_EMAIL        OCCUPATION_TYPE  CNT_FAM_MEMBERS  \
SK_ID_CURR                                                                   
112958               0           0              SIN VALOR              2.0   
238225               1           0             Core staff              3.0   
269335               0           0              SIN VALOR              1.0   
450836               0           0              SIN VALOR              2.0   
340566               1           1             Core staff              2.0   
...                ...         ...                    ...              ...   
235730               0           1  High skill tech staff              3.0   
367911               1           0               Laborers              2.0   
302585               0           0              SIN VALOR              1.0   
102961               0           0              SIN VALOR              2.0   
125815               0           0               Managers              1.0   

            REGION_RATING_CLIENT  REGION_RATING_CLIENT_W_CITY  \
SK_ID_CURR                                                      
112958                         2                            2   
238225                         3                            3   
269335                         1                            1   
450836                         2                            2   
340566                         2                            2   
...                          ...                          ...   
235730                         1                            1   
367911                         2                            2   
302585                         2                            2   
102961                         2                            2   
125815                         3                            3   

           WEEKDAY_APPR_PROCESS_START  HOUR_APPR_PROCESS_START  \
SK_ID_CURR                                                       
112958                         FRIDAY                       12   
238225                       THURSDAY                       12   
269335                        TUESDAY                       10   
450836                       THURSDAY                       11   
340566                      WEDNESDAY                       12   
...                               ...                      ...   
235730                         MONDAY                       12   
367911                       THURSDAY                       10   
302585                         FRIDAY                        9   
102961                       THURSDAY                       11   
125815                         FRIDAY                        5   

            REG_REGION_NOT_LIVE_REGION  REG_REGION_NOT_WORK_REGION  \
SK_ID_CURR                                                           
112958                               0                           0   
238225                               0                           0   
269335                               0                           0   
450836                               0                           0   
340566                               0                           0   
...                                ...                         ...   
235730                               0                           0   
367911                               0                           0   
302585                               0                           0   
102961                               0                           0   
125815                               0                           0   

            LIVE_REGION_NOT_WORK_REGION  REG_CITY_NOT_LIVE_CITY  \
SK_ID_CURR                                                        
112958                                0                       0   
238225                                0                       0   
269335                                0                       0   
450836                                0                       0   
340566                                0                       0   
...                                 ...                     ...   
235730                                0                       0   
367911                                0                       0   
302585                                0                       0   
102961                                0                       0   
125815                                0                       0   

            REG_CITY_NOT_WORK_CITY  LIVE_CITY_NOT_WORK_CITY  \
SK_ID_CURR                                                    
112958                           0                        0   
238225                           1                        1   
269335                           0                        0   
450836                           0                        0   
340566                           1                        1   
...                            ...                      ...   
235730                           0                        0   
367911                           0                        0   
302585                           0                        0   
102961                           0                        0   
125815                           1                        1   

                 ORGANIZATION_TYPE  EXT_SOURCE_1  EXT_SOURCE_2  EXT_SOURCE_3  \
SK_ID_CURR                                                                     
112958                      School           NaN      0.673949      0.694093   
238225           Transport: type 2           NaN      0.626277           NaN   
269335                         XNA           NaN      0.733055      0.511892   
450836                         XNA      0.338143      0.566085      0.349055   
340566                  Government      0.573164      0.200526      0.144648   
...                            ...           ...           ...           ...   
235730      Business Entity Type 3      0.648827      0.742543           NaN   
367911      Business Entity Type 3           NaN      0.573798      0.597192   
302585                       Other      0.446338      0.702760      0.710674   
102961                         XNA           NaN      0.685977      0.723837   
125815                       Other      0.163182      0.408982      0.474051   

            APARTMENTS_AVG  BASEMENTAREA_AVG  YEARS_BEGINEXPLUATATION_AVG  \
SK_ID_CURR                                                                  
112958              0.0629            0.0520                       0.9796   
238225                 NaN               NaN                          NaN   
269335                 NaN               NaN                          NaN   
450836                 NaN               NaN                          NaN   
340566                 NaN               NaN                          NaN   
...                    ...               ...                          ...   
235730              0.1017            0.0924                       0.9776   
367911              0.1031               NaN                       0.9806   
302585                 NaN               NaN                          NaN   
102961              0.1639               NaN                       0.9940   
125815              0.0825               NaN                       0.9811   

            YEARS_BUILD_AVG  COMMONAREA_AVG  ELEVATORS_AVG  ENTRANCES_AVG  \
SK_ID_CURR                                                                  
112958               0.7212           0.008         0.0000         0.1379   
238225                  NaN             NaN            NaN            NaN   
269335                  NaN             NaN            NaN            NaN   
450836                  NaN             NaN            NaN            NaN   
340566                  NaN             NaN            NaN            NaN   
...                     ...             ...            ...            ...   
235730               0.6940           0.000         0.0132         0.1434   
367911                  NaN             NaN            NaN         0.2069   
302585                  NaN             NaN            NaN            NaN   
102961                  NaN             NaN         0.0800         0.0345   
125815               0.7416             NaN         0.0000         0.1379   

            FLOORSMAX_AVG  FLOORSMIN_AVG  LANDAREA_AVG  LIVINGAPARTMENTS_AVG  \
SK_ID_CURR                                                                     
112958             0.1667         0.2083        0.0438                0.0513   
238225                NaN            NaN           NaN                   NaN   
269335                NaN            NaN           NaN                   NaN   
450836                NaN            NaN           NaN                   NaN   
340566                NaN            NaN           NaN                   NaN   
...                   ...            ...           ...                   ...   
235730             0.2429         0.0417        0.0000                0.0828   
367911             0.1667            NaN           NaN                   NaN   
302585                NaN            NaN           NaN                   NaN   
102961             0.6250            NaN           NaN                   NaN   
125815             0.1667         0.1667        0.0110                   NaN   

            LIVINGAREA_AVG  NONLIVINGAPARTMENTS_AVG  NONLIVINGAREA_AVG  \
SK_ID_CURR                                                               
112958              0.0550                   0.0000             0.0000   
238225                 NaN                      NaN                NaN   
269335                 NaN                      NaN                NaN   
450836                 NaN                      NaN                NaN   
340566                 NaN                      NaN                NaN   
...                    ...                      ...                ...   
235730              0.0906                   0.0006             0.0001   
367911              0.0914                      NaN                NaN   
302585                 NaN                      NaN                NaN   
102961              0.1354                      NaN             0.0163   
125815              0.0518                      NaN             0.0922   

            APARTMENTS_MODE  BASEMENTAREA_MODE  YEARS_BEGINEXPLUATATION_MODE  \
SK_ID_CURR                                                                     
112958               0.0641              0.054                        0.9796   
238225                  NaN                NaN                           NaN   
269335                  NaN                NaN                           NaN   
450836                  NaN                NaN                           NaN   
340566                  NaN                NaN                           NaN   
...                     ...                ...                           ...   
235730               0.1050              0.108                        0.9762   
367911               0.1050                NaN                        0.9806   
302585                  NaN                NaN                           NaN   
102961               0.1670                NaN                        0.9940   
125815               0.0840                NaN                        0.9811   

            YEARS_BUILD_MODE  COMMONAREA_MODE  ELEVATORS_MODE  ENTRANCES_MODE  \
SK_ID_CURR                                                                      
112958                0.7321           0.0081          0.0000          0.1379   
238225                   NaN              NaN             NaN             NaN   
269335                   NaN              NaN             NaN             NaN   
450836                   NaN              NaN             NaN             NaN   
340566                   NaN              NaN             NaN             NaN   
...                      ...              ...             ...             ...   
235730                0.6864           0.0000          0.0000          0.1724   
367911                   NaN              NaN             NaN          0.2069   
302585                   NaN              NaN             NaN             NaN   
102961                   NaN              NaN          0.0806          0.0345   
125815                0.7517              NaN          0.0000          0.1379   

            FLOORSMAX_MODE  FLOORSMIN_MODE  LANDAREA_MODE  \
SK_ID_CURR                                                  
112958              0.1667          0.2083         0.0448   
238225                 NaN             NaN            NaN   
269335                 NaN             NaN            NaN   
450836                 NaN             NaN            NaN   
340566                 NaN             NaN            NaN   
...                    ...             ...            ...   
235730              0.1667          0.0417         0.0000   
367911              0.1667             NaN            NaN   
302585                 NaN             NaN            NaN   
102961              0.6250             NaN            NaN   
125815              0.1667          0.1667         0.0113   

            LIVINGAPARTMENTS_MODE  LIVINGAREA_MODE  NONLIVINGAPARTMENTS_MODE  \
SK_ID_CURR                                                                     
112958                     0.0560           0.0573                       0.0   
238225                        NaN              NaN                       NaN   
269335                        NaN              NaN                       NaN   
450836                        NaN              NaN                       NaN   
340566                        NaN              NaN                       NaN   
...                           ...              ...                       ...   
235730                     0.0918           0.0748                       0.0   
367911                        NaN           0.0952                       NaN   
302585                        NaN              NaN                       NaN   
102961                        NaN           0.1410                       NaN   
125815                        NaN           0.0540                       NaN   

            NONLIVINGAREA_MODE  APARTMENTS_MEDI  BASEMENTAREA_MEDI  \
SK_ID_CURR                                                           
112958                  0.0000           0.0635              0.052   
238225                     NaN              NaN                NaN   
269335                     NaN              NaN                NaN   
450836                     NaN              NaN                NaN   
340566                     NaN              NaN                NaN   
...                        ...              ...                ...   
235730                  0.0000           0.1041              0.104   
367911                     NaN           0.1041                NaN   
302585                     NaN              NaN                NaN   
102961                  0.0172           0.1655                NaN   
125815                  0.0976           0.0833                NaN   

            YEARS_BEGINEXPLUATATION_MEDI  YEARS_BUILD_MEDI  COMMONAREA_MEDI  \
SK_ID_CURR                                                                    
112958                            0.9796            0.7249            0.008   
238225                               NaN               NaN              NaN   
269335                               NaN               NaN              NaN   
450836                               NaN               NaN              NaN   
340566                               NaN               NaN              NaN   
...                                  ...               ...              ...   
235730                            0.9762            0.6780            0.000   
367911                            0.9806               NaN              NaN   
302585                               NaN               NaN              NaN   
102961                            0.9940               NaN              NaN   
125815                            0.9811            0.7451              NaN   

            ELEVATORS_MEDI  ENTRANCES_MEDI  FLOORSMAX_MEDI  FLOORSMIN_MEDI  \
SK_ID_CURR                                                                   
112958                0.00          0.1379          0.1667          0.2083   
238225                 NaN             NaN             NaN             NaN   
269335                 NaN             NaN             NaN             NaN   
450836                 NaN             NaN             NaN             NaN   
340566                 NaN             NaN             NaN             NaN   
...                    ...             ...             ...             ...   
235730                0.00          0.1724          0.1667          0.0417   
367911                 NaN          0.2069          0.1667             NaN   
302585                 NaN             NaN             NaN             NaN   
102961                0.08          0.0345          0.6250             NaN   
125815                0.00          0.1379          0.1667          0.1667   

            LANDAREA_MEDI  LIVINGAPARTMENTS_MEDI  LIVINGAREA_MEDI  \
SK_ID_CURR                                                          
112958             0.0446                 0.0522           0.0560   
238225                NaN                    NaN              NaN   
269335                NaN                    NaN              NaN   
450836                NaN                    NaN              NaN   
340566                NaN                    NaN              NaN   
...                   ...                    ...              ...   
235730             0.0000                 0.0855           0.0918   
367911                NaN                    NaN           0.0930   
302585                NaN                    NaN              NaN   
102961                NaN                    NaN           0.1378   
125815             0.0112                    NaN           0.0528   

            NONLIVINGAPARTMENTS_MEDI  NONLIVINGAREA_MEDI  \
SK_ID_CURR                                                 
112958                           0.0              0.0000   
238225                           NaN                 NaN   
269335                           NaN                 NaN   
450836                           NaN                 NaN   
340566                           NaN                 NaN   
...                              ...                 ...   
235730                           0.0              0.0000   
367911                           NaN                 NaN   
302585                           NaN                 NaN   
102961                           NaN              0.0166   
125815                           NaN              0.0941   

               FONDKAPREMONT_MODE  HOUSETYPE_MODE  TOTALAREA_MODE  \
SK_ID_CURR                                                          
112958           reg oper account  block of flats          0.0433   
238225                  SIN VALOR       SIN VALOR             NaN   
269335                  SIN VALOR       SIN VALOR             NaN   
450836                  SIN VALOR       SIN VALOR             NaN   
340566                  SIN VALOR       SIN VALOR             NaN   
...                           ...             ...             ...   
235730           reg oper account  block of flats          0.0565   
367911                  SIN VALOR  block of flats          0.0797   
302585                  SIN VALOR       SIN VALOR             NaN   
102961                  SIN VALOR  block of flats          0.1100   
125815      reg oper spec account  block of flats          0.0608   

           WALLSMATERIAL_MODE EMERGENCYSTATE_MODE  OBS_30_CNT_SOCIAL_CIRCLE  \
SK_ID_CURR                                                                    
112958                  Panel                  No                       3.0   
238225              SIN VALOR           SIN VALOR                       1.0   
269335              SIN VALOR           SIN VALOR                       1.0   
450836              SIN VALOR           SIN VALOR                       5.0   
340566              SIN VALOR           SIN VALOR                       0.0   
...                       ...                 ...                       ...   
235730                  Block                  No                       0.0   
367911              SIN VALOR                  No                       0.0   
302585              SIN VALOR           SIN VALOR                       0.0   
102961             Monolithic                  No                       7.0   
125815                  Panel                  No                       4.0   

            DEF_30_CNT_SOCIAL_CIRCLE  OBS_60_CNT_SOCIAL_CIRCLE  \
SK_ID_CURR                                                       
112958                           0.0                       3.0   
238225                           0.0                       1.0   
269335                           0.0                       1.0   
450836                           0.0                       5.0   
340566                           0.0                       0.0   
...                              ...                       ...   
235730                           0.0                       0.0   
367911                           0.0                       0.0   
302585                           0.0                       0.0   
102961                           0.0                       7.0   
125815                           1.0                       4.0   

            DEF_60_CNT_SOCIAL_CIRCLE  DAYS_LAST_PHONE_CHANGE  FLAG_DOCUMENT_2  \
SK_ID_CURR                                                                      
112958                           0.0                 -1103.0                0   
238225                           0.0                 -3107.0                0   
269335                           0.0                  -894.0                0   
450836                           0.0                  -849.0                0   
340566                           0.0                 -1563.0                0   
...                              ...                     ...              ...   
235730                           0.0                 -1902.0                0   
367911                           0.0                 -2635.0                0   
302585                           0.0                  -517.0                0   
102961                           0.0                     0.0                0   
125815                           1.0                 -1197.0                0   

            FLAG_DOCUMENT_3  FLAG_DOCUMENT_4  FLAG_DOCUMENT_5  \
SK_ID_CURR                                                      
112958                    0                0                0   
238225                    1                0                0   
269335                    0                0                0   
450836                    1                0                0   
340566                    0                0                0   
...                     ...              ...              ...   
235730                    0                0                0   
367911                    1                0                0   
302585                    1                0                0   
102961                    0                0                0   
125815                    1                0                0   

            FLAG_DOCUMENT_6  FLAG_DOCUMENT_7  FLAG_DOCUMENT_8  \
SK_ID_CURR                                                      
112958                    0                0                0   
238225                    0                0                0   
269335                    1                0                0   
450836                    0                0                0   
340566                    0                0                0   
...                     ...              ...              ...   
235730                    0                0                1   
367911                    0                0                0   
302585                    0                0                0   
102961                    1                0                0   
125815                    0                0                0   

            FLAG_DOCUMENT_9  FLAG_DOCUMENT_10  FLAG_DOCUMENT_11  \
SK_ID_CURR                                                        
112958                    0                 0                 0   
238225                    0                 0                 0   
269335                    0                 0                 0   
450836                    0                 0                 0   
340566                    0                 0                 0   
...                     ...               ...               ...   
235730                    0                 0                 0   
367911                    0                 0                 0   
302585                    0                 0                 0   
102961                    0                 0                 0   
125815                    0                 0                 0   

            FLAG_DOCUMENT_12  FLAG_DOCUMENT_13  FLAG_DOCUMENT_14  \
SK_ID_CURR                                                         
112958                     0                 0                 0   
238225                     0                 0                 0   
269335                     0                 0                 0   
450836                     0                 0                 0   
340566                     0                 0                 0   
...                      ...               ...               ...   
235730                     0                 0                 0   
367911                     0                 1                 0   
302585                     0                 0                 0   
102961                     0                 0                 0   
125815                     0                 0                 0   

            FLAG_DOCUMENT_15  FLAG_DOCUMENT_16  FLAG_DOCUMENT_17  \
SK_ID_CURR                                                         
112958                     0                 0                 0   
238225                     0                 0                 0   
269335                     0                 0                 0   
450836                     0                 0                 0   
340566                     0                 0                 0   
...                      ...               ...               ...   
235730                     0                 0                 0   
367911                     0                 0                 0   
302585                     0                 0                 0   
102961                     0                 0                 0   
125815                     0                 0                 0   

            FLAG_DOCUMENT_18  FLAG_DOCUMENT_19  FLAG_DOCUMENT_20  \
SK_ID_CURR                                                         
112958                     1                 0                 0   
238225                     0                 0                 0   
269335                     0                 0                 0   
450836                     0                 0                 0   
340566                     0                 0                 0   
...                      ...               ...               ...   
235730                     0                 0                 0   
367911                     0                 0                 0   
302585                     0                 0                 0   
102961                     0                 0                 0   
125815                     0                 0                 0   

            FLAG_DOCUMENT_21  AMT_REQ_CREDIT_BUREAU_HOUR  \
SK_ID_CURR                                                 
112958                     0                         0.0   
238225                     0                         0.0   
269335                     0                         0.0   
450836                     0                         0.0   
340566                     0                         0.0   
...                      ...                         ...   
235730                     0                         0.0   
367911                     0                         0.0   
302585                     0                         0.0   
102961                     0                         0.0   
125815                     0                         0.0   

            AMT_REQ_CREDIT_BUREAU_DAY  AMT_REQ_CREDIT_BUREAU_WEEK  \
SK_ID_CURR                                                          
112958                            0.0                         0.0   
238225                            0.0                         0.0   
269335                            0.0                         0.0   
450836                            0.0                         0.0   
340566                            0.0                         0.0   
...                               ...                         ...   
235730                            0.0                         0.0   
367911                            0.0                         0.0   
302585                            0.0                         0.0   
102961                            0.0                         0.0   
125815                            0.0                         0.0   

            AMT_REQ_CREDIT_BUREAU_MON  AMT_REQ_CREDIT_BUREAU_QRT  \
SK_ID_CURR                                                         
112958                            0.0                        0.0   
238225                            0.0                        1.0   
269335                            0.0                        0.0   
450836                            0.0                        0.0   
340566                            1.0                        0.0   
...                               ...                        ...   
235730                            1.0                        0.0   
367911                            0.0                        0.0   
302585                            0.0                        0.0   
102961                            2.0                        1.0   
125815                            0.0                        0.0   

            AMT_REQ_CREDIT_BUREAU_YEAR  TARGET  
SK_ID_CURR                                      
112958                             1.0       0  
238225                             0.0       0  
269335                             2.0       0  
450836                             2.0       0  
340566                             5.0       0  
...                                ...     ...  
235730                             2.0       0  
367911                             0.0       0  
302585                             1.0       0  
102961                             3.0       0  
125815                             1.0       0  

[61503 rows x 121 columns]
In [43]:
pd_loan_train.to_csv("../data/train_pd_data_preprocessing_missing_outlier.csv")
pd_loan_test.to_csv("../data/test_pd_data_preprocessing_missing_outlier.csv")